mirror of
https://github.com/AUTOMATIC1111/stable-diffusion-webui.git
synced 2025-08-09 13:49:48 +00:00
images history improvement
This commit is contained in:
@@ -36,6 +36,7 @@ errors.run(enable_tf32, "Enabling TF32")
|
||||
|
||||
device = device_gfpgan = device_bsrgan = device_esrgan = device_scunet = device_codeformer = get_optimal_device()
|
||||
dtype = torch.float16
|
||||
dtype_vae = torch.float16
|
||||
|
||||
def randn(seed, shape):
|
||||
# Pytorch currently doesn't handle setting randomness correctly when the metal backend is used.
|
||||
@@ -59,9 +60,12 @@ def randn_without_seed(shape):
|
||||
return torch.randn(shape, device=device)
|
||||
|
||||
|
||||
def autocast():
|
||||
def autocast(disable=False):
|
||||
from modules import shared
|
||||
|
||||
if disable:
|
||||
return contextlib.nullcontext()
|
||||
|
||||
if dtype == torch.float32 or shared.cmd_opts.precision == "full":
|
||||
return contextlib.nullcontext()
|
||||
|
||||
|
@@ -1,98 +0,0 @@
|
||||
import glob
|
||||
import os
|
||||
import sys
|
||||
import traceback
|
||||
|
||||
import torch
|
||||
|
||||
from ldm.util import default
|
||||
from modules import devices, shared
|
||||
import torch
|
||||
from torch import einsum
|
||||
from einops import rearrange, repeat
|
||||
|
||||
|
||||
class HypernetworkModule(torch.nn.Module):
|
||||
def __init__(self, dim, state_dict):
|
||||
super().__init__()
|
||||
|
||||
self.linear1 = torch.nn.Linear(dim, dim * 2)
|
||||
self.linear2 = torch.nn.Linear(dim * 2, dim)
|
||||
|
||||
self.load_state_dict(state_dict, strict=True)
|
||||
self.to(devices.device)
|
||||
|
||||
def forward(self, x):
|
||||
return x + (self.linear2(self.linear1(x)))
|
||||
|
||||
|
||||
class Hypernetwork:
|
||||
filename = None
|
||||
name = None
|
||||
|
||||
def __init__(self, filename):
|
||||
self.filename = filename
|
||||
self.name = os.path.splitext(os.path.basename(filename))[0]
|
||||
self.layers = {}
|
||||
|
||||
state_dict = torch.load(filename, map_location='cpu')
|
||||
for size, sd in state_dict.items():
|
||||
self.layers[size] = (HypernetworkModule(size, sd[0]), HypernetworkModule(size, sd[1]))
|
||||
|
||||
|
||||
def list_hypernetworks(path):
|
||||
res = {}
|
||||
for filename in glob.iglob(os.path.join(path, '**/*.pt'), recursive=True):
|
||||
name = os.path.splitext(os.path.basename(filename))[0]
|
||||
res[name] = filename
|
||||
return res
|
||||
|
||||
|
||||
def load_hypernetwork(filename):
|
||||
path = shared.hypernetworks.get(filename, None)
|
||||
if path is not None:
|
||||
print(f"Loading hypernetwork {filename}")
|
||||
try:
|
||||
shared.loaded_hypernetwork = Hypernetwork(path)
|
||||
except Exception:
|
||||
print(f"Error loading hypernetwork {path}", file=sys.stderr)
|
||||
print(traceback.format_exc(), file=sys.stderr)
|
||||
else:
|
||||
if shared.loaded_hypernetwork is not None:
|
||||
print(f"Unloading hypernetwork")
|
||||
|
||||
shared.loaded_hypernetwork = None
|
||||
|
||||
|
||||
def attention_CrossAttention_forward(self, x, context=None, mask=None):
|
||||
h = self.heads
|
||||
|
||||
q = self.to_q(x)
|
||||
context = default(context, x)
|
||||
|
||||
hypernetwork = shared.loaded_hypernetwork
|
||||
hypernetwork_layers = (hypernetwork.layers if hypernetwork is not None else {}).get(context.shape[2], None)
|
||||
|
||||
if hypernetwork_layers is not None:
|
||||
k = self.to_k(hypernetwork_layers[0](context))
|
||||
v = self.to_v(hypernetwork_layers[1](context))
|
||||
else:
|
||||
k = self.to_k(context)
|
||||
v = self.to_v(context)
|
||||
|
||||
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
|
||||
|
||||
sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
|
||||
|
||||
if mask is not None:
|
||||
mask = rearrange(mask, 'b ... -> b (...)')
|
||||
max_neg_value = -torch.finfo(sim.dtype).max
|
||||
mask = repeat(mask, 'b j -> (b h) () j', h=h)
|
||||
sim.masked_fill_(~mask, max_neg_value)
|
||||
|
||||
# attention, what we cannot get enough of
|
||||
attn = sim.softmax(dim=-1)
|
||||
|
||||
out = einsum('b i j, b j d -> b i d', attn, v)
|
||||
out = rearrange(out, '(b h) n d -> b n (h d)', h=h)
|
||||
return self.to_out(out)
|
283
modules/hypernetwork/hypernetwork.py
Normal file
283
modules/hypernetwork/hypernetwork.py
Normal file
@@ -0,0 +1,283 @@
|
||||
import datetime
|
||||
import glob
|
||||
import html
|
||||
import os
|
||||
import sys
|
||||
import traceback
|
||||
import tqdm
|
||||
|
||||
import torch
|
||||
|
||||
from ldm.util import default
|
||||
from modules import devices, shared, processing, sd_models
|
||||
import torch
|
||||
from torch import einsum
|
||||
from einops import rearrange, repeat
|
||||
import modules.textual_inversion.dataset
|
||||
|
||||
|
||||
class HypernetworkModule(torch.nn.Module):
|
||||
def __init__(self, dim, state_dict=None):
|
||||
super().__init__()
|
||||
|
||||
self.linear1 = torch.nn.Linear(dim, dim * 2)
|
||||
self.linear2 = torch.nn.Linear(dim * 2, dim)
|
||||
|
||||
if state_dict is not None:
|
||||
self.load_state_dict(state_dict, strict=True)
|
||||
else:
|
||||
|
||||
self.linear1.weight.data.normal_(mean=0.0, std=0.01)
|
||||
self.linear1.bias.data.zero_()
|
||||
self.linear2.weight.data.normal_(mean=0.0, std=0.01)
|
||||
self.linear2.bias.data.zero_()
|
||||
|
||||
self.to(devices.device)
|
||||
|
||||
def forward(self, x):
|
||||
return x + (self.linear2(self.linear1(x)))
|
||||
|
||||
|
||||
class Hypernetwork:
|
||||
filename = None
|
||||
name = None
|
||||
|
||||
def __init__(self, name=None):
|
||||
self.filename = None
|
||||
self.name = name
|
||||
self.layers = {}
|
||||
self.step = 0
|
||||
self.sd_checkpoint = None
|
||||
self.sd_checkpoint_name = None
|
||||
|
||||
for size in [320, 640, 768, 1280]:
|
||||
self.layers[size] = (HypernetworkModule(size), HypernetworkModule(size))
|
||||
|
||||
def weights(self):
|
||||
res = []
|
||||
|
||||
for k, layers in self.layers.items():
|
||||
for layer in layers:
|
||||
layer.train()
|
||||
res += [layer.linear1.weight, layer.linear1.bias, layer.linear2.weight, layer.linear2.bias]
|
||||
|
||||
return res
|
||||
|
||||
def save(self, filename):
|
||||
state_dict = {}
|
||||
|
||||
for k, v in self.layers.items():
|
||||
state_dict[k] = (v[0].state_dict(), v[1].state_dict())
|
||||
|
||||
state_dict['step'] = self.step
|
||||
state_dict['name'] = self.name
|
||||
state_dict['sd_checkpoint'] = self.sd_checkpoint
|
||||
state_dict['sd_checkpoint_name'] = self.sd_checkpoint_name
|
||||
|
||||
torch.save(state_dict, filename)
|
||||
|
||||
def load(self, filename):
|
||||
self.filename = filename
|
||||
if self.name is None:
|
||||
self.name = os.path.splitext(os.path.basename(filename))[0]
|
||||
|
||||
state_dict = torch.load(filename, map_location='cpu')
|
||||
|
||||
for size, sd in state_dict.items():
|
||||
if type(size) == int:
|
||||
self.layers[size] = (HypernetworkModule(size, sd[0]), HypernetworkModule(size, sd[1]))
|
||||
|
||||
self.name = state_dict.get('name', self.name)
|
||||
self.step = state_dict.get('step', 0)
|
||||
self.sd_checkpoint = state_dict.get('sd_checkpoint', None)
|
||||
self.sd_checkpoint_name = state_dict.get('sd_checkpoint_name', None)
|
||||
|
||||
|
||||
def list_hypernetworks(path):
|
||||
res = {}
|
||||
for filename in glob.iglob(os.path.join(path, '**/*.pt'), recursive=True):
|
||||
name = os.path.splitext(os.path.basename(filename))[0]
|
||||
res[name] = filename
|
||||
return res
|
||||
|
||||
|
||||
def load_hypernetwork(filename):
|
||||
path = shared.hypernetworks.get(filename, None)
|
||||
if path is not None:
|
||||
print(f"Loading hypernetwork {filename}")
|
||||
try:
|
||||
shared.loaded_hypernetwork = Hypernetwork()
|
||||
shared.loaded_hypernetwork.load(path)
|
||||
|
||||
except Exception:
|
||||
print(f"Error loading hypernetwork {path}", file=sys.stderr)
|
||||
print(traceback.format_exc(), file=sys.stderr)
|
||||
else:
|
||||
if shared.loaded_hypernetwork is not None:
|
||||
print(f"Unloading hypernetwork")
|
||||
|
||||
shared.loaded_hypernetwork = None
|
||||
|
||||
|
||||
def apply_hypernetwork(hypernetwork, context, layer=None):
|
||||
hypernetwork_layers = (hypernetwork.layers if hypernetwork is not None else {}).get(context.shape[2], None)
|
||||
|
||||
if hypernetwork_layers is None:
|
||||
return context, context
|
||||
|
||||
if layer is not None:
|
||||
layer.hyper_k = hypernetwork_layers[0]
|
||||
layer.hyper_v = hypernetwork_layers[1]
|
||||
|
||||
context_k = hypernetwork_layers[0](context)
|
||||
context_v = hypernetwork_layers[1](context)
|
||||
return context_k, context_v
|
||||
|
||||
|
||||
def attention_CrossAttention_forward(self, x, context=None, mask=None):
|
||||
h = self.heads
|
||||
|
||||
q = self.to_q(x)
|
||||
context = default(context, x)
|
||||
|
||||
context_k, context_v = apply_hypernetwork(shared.loaded_hypernetwork, context, self)
|
||||
k = self.to_k(context_k)
|
||||
v = self.to_v(context_v)
|
||||
|
||||
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
|
||||
|
||||
sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
|
||||
|
||||
if mask is not None:
|
||||
mask = rearrange(mask, 'b ... -> b (...)')
|
||||
max_neg_value = -torch.finfo(sim.dtype).max
|
||||
mask = repeat(mask, 'b j -> (b h) () j', h=h)
|
||||
sim.masked_fill_(~mask, max_neg_value)
|
||||
|
||||
# attention, what we cannot get enough of
|
||||
attn = sim.softmax(dim=-1)
|
||||
|
||||
out = einsum('b i j, b j d -> b i d', attn, v)
|
||||
out = rearrange(out, '(b h) n d -> b n (h d)', h=h)
|
||||
return self.to_out(out)
|
||||
|
||||
|
||||
def train_hypernetwork(hypernetwork_name, learn_rate, data_root, log_directory, steps, create_image_every, save_hypernetwork_every, template_file, preview_image_prompt):
|
||||
assert hypernetwork_name, 'embedding not selected'
|
||||
|
||||
path = shared.hypernetworks.get(hypernetwork_name, None)
|
||||
shared.loaded_hypernetwork = Hypernetwork()
|
||||
shared.loaded_hypernetwork.load(path)
|
||||
|
||||
shared.state.textinfo = "Initializing hypernetwork training..."
|
||||
shared.state.job_count = steps
|
||||
|
||||
filename = os.path.join(shared.cmd_opts.hypernetwork_dir, f'{hypernetwork_name}.pt')
|
||||
|
||||
log_directory = os.path.join(log_directory, datetime.datetime.now().strftime("%Y-%m-%d"), hypernetwork_name)
|
||||
|
||||
if save_hypernetwork_every > 0:
|
||||
hypernetwork_dir = os.path.join(log_directory, "hypernetworks")
|
||||
os.makedirs(hypernetwork_dir, exist_ok=True)
|
||||
else:
|
||||
hypernetwork_dir = None
|
||||
|
||||
if create_image_every > 0:
|
||||
images_dir = os.path.join(log_directory, "images")
|
||||
os.makedirs(images_dir, exist_ok=True)
|
||||
else:
|
||||
images_dir = None
|
||||
|
||||
cond_model = shared.sd_model.cond_stage_model
|
||||
|
||||
shared.state.textinfo = f"Preparing dataset from {html.escape(data_root)}..."
|
||||
with torch.autocast("cuda"):
|
||||
ds = modules.textual_inversion.dataset.PersonalizedBase(data_root=data_root, width=512, height=512, repeats=1, placeholder_token=hypernetwork_name, model=shared.sd_model, device=devices.device, template_file=template_file)
|
||||
|
||||
hypernetwork = shared.loaded_hypernetwork
|
||||
weights = hypernetwork.weights()
|
||||
for weight in weights:
|
||||
weight.requires_grad = True
|
||||
|
||||
optimizer = torch.optim.AdamW(weights, lr=learn_rate)
|
||||
|
||||
losses = torch.zeros((32,))
|
||||
|
||||
last_saved_file = "<none>"
|
||||
last_saved_image = "<none>"
|
||||
|
||||
ititial_step = hypernetwork.step or 0
|
||||
if ititial_step > steps:
|
||||
return hypernetwork, filename
|
||||
|
||||
pbar = tqdm.tqdm(enumerate(ds), total=steps - ititial_step)
|
||||
for i, (x, text) in pbar:
|
||||
hypernetwork.step = i + ititial_step
|
||||
|
||||
if hypernetwork.step > steps:
|
||||
break
|
||||
|
||||
if shared.state.interrupted:
|
||||
break
|
||||
|
||||
with torch.autocast("cuda"):
|
||||
c = cond_model([text])
|
||||
|
||||
x = x.to(devices.device)
|
||||
loss = shared.sd_model(x.unsqueeze(0), c)[0]
|
||||
del x
|
||||
|
||||
losses[hypernetwork.step % losses.shape[0]] = loss.item()
|
||||
|
||||
optimizer.zero_grad()
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
|
||||
pbar.set_description(f"loss: {losses.mean():.7f}")
|
||||
|
||||
if hypernetwork.step > 0 and hypernetwork_dir is not None and hypernetwork.step % save_hypernetwork_every == 0:
|
||||
last_saved_file = os.path.join(hypernetwork_dir, f'{hypernetwork_name}-{hypernetwork.step}.pt')
|
||||
hypernetwork.save(last_saved_file)
|
||||
|
||||
if hypernetwork.step > 0 and images_dir is not None and hypernetwork.step % create_image_every == 0:
|
||||
last_saved_image = os.path.join(images_dir, f'{hypernetwork_name}-{hypernetwork.step}.png')
|
||||
|
||||
preview_text = text if preview_image_prompt == "" else preview_image_prompt
|
||||
|
||||
p = processing.StableDiffusionProcessingTxt2Img(
|
||||
sd_model=shared.sd_model,
|
||||
prompt=preview_text,
|
||||
steps=20,
|
||||
do_not_save_grid=True,
|
||||
do_not_save_samples=True,
|
||||
)
|
||||
|
||||
processed = processing.process_images(p)
|
||||
image = processed.images[0]
|
||||
|
||||
shared.state.current_image = image
|
||||
image.save(last_saved_image)
|
||||
|
||||
last_saved_image += f", prompt: {preview_text}"
|
||||
|
||||
shared.state.job_no = hypernetwork.step
|
||||
|
||||
shared.state.textinfo = f"""
|
||||
<p>
|
||||
Loss: {losses.mean():.7f}<br/>
|
||||
Step: {hypernetwork.step}<br/>
|
||||
Last prompt: {html.escape(text)}<br/>
|
||||
Last saved embedding: {html.escape(last_saved_file)}<br/>
|
||||
Last saved image: {html.escape(last_saved_image)}<br/>
|
||||
</p>
|
||||
"""
|
||||
|
||||
checkpoint = sd_models.select_checkpoint()
|
||||
|
||||
hypernetwork.sd_checkpoint = checkpoint.hash
|
||||
hypernetwork.sd_checkpoint_name = checkpoint.model_name
|
||||
hypernetwork.save(filename)
|
||||
|
||||
return hypernetwork, filename
|
||||
|
||||
|
43
modules/hypernetwork/ui.py
Normal file
43
modules/hypernetwork/ui.py
Normal file
@@ -0,0 +1,43 @@
|
||||
import html
|
||||
import os
|
||||
|
||||
import gradio as gr
|
||||
|
||||
import modules.textual_inversion.textual_inversion
|
||||
import modules.textual_inversion.preprocess
|
||||
from modules import sd_hijack, shared
|
||||
from modules.hypernetwork import hypernetwork
|
||||
|
||||
|
||||
def create_hypernetwork(name):
|
||||
fn = os.path.join(shared.cmd_opts.hypernetwork_dir, f"{name}.pt")
|
||||
assert not os.path.exists(fn), f"file {fn} already exists"
|
||||
|
||||
hypernet = modules.hypernetwork.hypernetwork.Hypernetwork(name=name)
|
||||
hypernet.save(fn)
|
||||
|
||||
shared.reload_hypernetworks()
|
||||
|
||||
return gr.Dropdown.update(choices=sorted([x for x in shared.hypernetworks.keys()])), f"Created: {fn}", ""
|
||||
|
||||
|
||||
def train_hypernetwork(*args):
|
||||
|
||||
initial_hypernetwork = shared.loaded_hypernetwork
|
||||
|
||||
try:
|
||||
sd_hijack.undo_optimizations()
|
||||
|
||||
hypernetwork, filename = modules.hypernetwork.hypernetwork.train_hypernetwork(*args)
|
||||
|
||||
res = f"""
|
||||
Training {'interrupted' if shared.state.interrupted else 'finished'} at {hypernetwork.step} steps.
|
||||
Hypernetwork saved to {html.escape(filename)}
|
||||
"""
|
||||
return res, ""
|
||||
except Exception:
|
||||
raise
|
||||
finally:
|
||||
shared.loaded_hypernetwork = initial_hypernetwork
|
||||
sd_hijack.apply_optimizations()
|
||||
|
@@ -207,7 +207,7 @@ def create_random_tensors(shape, seeds, subseeds=None, subseed_strength=0.0, see
|
||||
# enables the generation of additional tensors with noise that the sampler will use during its processing.
|
||||
# Using those pre-generated tensors instead of simple torch.randn allows a batch with seeds [100, 101] to
|
||||
# produce the same images as with two batches [100], [101].
|
||||
if p is not None and p.sampler is not None and len(seeds) > 1 and opts.enable_batch_seeds:
|
||||
if p is not None and p.sampler is not None and (len(seeds) > 1 and opts.enable_batch_seeds or opts.eta_noise_seed_delta > 0):
|
||||
sampler_noises = [[] for _ in range(p.sampler.number_of_needed_noises(p))]
|
||||
else:
|
||||
sampler_noises = None
|
||||
@@ -247,6 +247,9 @@ def create_random_tensors(shape, seeds, subseeds=None, subseed_strength=0.0, see
|
||||
if sampler_noises is not None:
|
||||
cnt = p.sampler.number_of_needed_noises(p)
|
||||
|
||||
if opts.eta_noise_seed_delta > 0:
|
||||
torch.manual_seed(seed + opts.eta_noise_seed_delta)
|
||||
|
||||
for j in range(cnt):
|
||||
sampler_noises[j].append(devices.randn_without_seed(tuple(noise_shape)))
|
||||
|
||||
@@ -259,6 +262,13 @@ def create_random_tensors(shape, seeds, subseeds=None, subseed_strength=0.0, see
|
||||
return x
|
||||
|
||||
|
||||
def decode_first_stage(model, x):
|
||||
with devices.autocast(disable=x.dtype == devices.dtype_vae):
|
||||
x = model.decode_first_stage(x)
|
||||
|
||||
return x
|
||||
|
||||
|
||||
def get_fixed_seed(seed):
|
||||
if seed is None or seed == '' or seed == -1:
|
||||
return int(random.randrange(4294967294))
|
||||
@@ -294,6 +304,7 @@ def create_infotext(p, all_prompts, all_seeds, all_subseeds, comments, iteration
|
||||
"Denoising strength": getattr(p, 'denoising_strength', None),
|
||||
"Eta": (None if p.sampler is None or p.sampler.eta == p.sampler.default_eta else p.sampler.eta),
|
||||
"Clip skip": None if clip_skip <= 1 else clip_skip,
|
||||
"ENSD": None if opts.eta_noise_seed_delta == 0 else opts.eta_noise_seed_delta,
|
||||
}
|
||||
|
||||
generation_params.update(p.extra_generation_params)
|
||||
@@ -398,9 +409,8 @@ def process_images(p: StableDiffusionProcessing) -> Processed:
|
||||
# use the image collected previously in sampler loop
|
||||
samples_ddim = shared.state.current_latent
|
||||
|
||||
samples_ddim = samples_ddim.to(devices.dtype)
|
||||
|
||||
x_samples_ddim = p.sd_model.decode_first_stage(samples_ddim)
|
||||
samples_ddim = samples_ddim.to(devices.dtype_vae)
|
||||
x_samples_ddim = decode_first_stage(p.sd_model, samples_ddim)
|
||||
x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
|
||||
|
||||
del samples_ddim
|
||||
@@ -533,7 +543,7 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing):
|
||||
if self.scale_latent:
|
||||
samples = torch.nn.functional.interpolate(samples, size=(self.height // opt_f, self.width // opt_f), mode="bilinear")
|
||||
else:
|
||||
decoded_samples = self.sd_model.decode_first_stage(samples)
|
||||
decoded_samples = decode_first_stage(self.sd_model, samples)
|
||||
|
||||
if opts.upscaler_for_img2img is None or opts.upscaler_for_img2img == "None":
|
||||
decoded_samples = torch.nn.functional.interpolate(decoded_samples, size=(self.height, self.width), mode="bilinear")
|
||||
|
@@ -8,7 +8,7 @@ from torch import einsum
|
||||
from torch.nn.functional import silu
|
||||
|
||||
import modules.textual_inversion.textual_inversion
|
||||
from modules import prompt_parser, devices, sd_hijack_optimizations, shared, hypernetwork
|
||||
from modules import prompt_parser, devices, sd_hijack_optimizations, shared
|
||||
from modules.shared import opts, device, cmd_opts
|
||||
|
||||
import ldm.modules.attention
|
||||
@@ -23,7 +23,7 @@ def apply_optimizations():
|
||||
|
||||
ldm.modules.diffusionmodules.model.nonlinearity = silu
|
||||
|
||||
if cmd_opts.force_enable_xformers or (cmd_opts.xformers and shared.xformers_available and torch.version.cuda and torch.cuda.get_device_capability(shared.device) == (8, 6)):
|
||||
if cmd_opts.force_enable_xformers or (cmd_opts.xformers and shared.xformers_available and torch.version.cuda and (6, 0) <= torch.cuda.get_device_capability(shared.device) <= (8, 6)):
|
||||
print("Applying xformers cross attention optimization.")
|
||||
ldm.modules.attention.CrossAttention.forward = sd_hijack_optimizations.xformers_attention_forward
|
||||
ldm.modules.diffusionmodules.model.AttnBlock.forward = sd_hijack_optimizations.xformers_attnblock_forward
|
||||
@@ -37,16 +37,15 @@ def apply_optimizations():
|
||||
|
||||
|
||||
def undo_optimizations():
|
||||
from modules.hypernetwork import hypernetwork
|
||||
|
||||
ldm.modules.attention.CrossAttention.forward = hypernetwork.attention_CrossAttention_forward
|
||||
ldm.modules.diffusionmodules.model.nonlinearity = diffusionmodules_model_nonlinearity
|
||||
ldm.modules.diffusionmodules.model.AttnBlock.forward = diffusionmodules_model_AttnBlock_forward
|
||||
|
||||
|
||||
def get_target_prompt_token_count(token_count):
|
||||
if token_count < 75:
|
||||
return 75
|
||||
|
||||
return math.ceil(token_count / 10) * 10
|
||||
return math.ceil(max(token_count, 1) / 75) * 75
|
||||
|
||||
|
||||
class StableDiffusionModelHijack:
|
||||
@@ -110,6 +109,8 @@ class FrozenCLIPEmbedderWithCustomWords(torch.nn.Module):
|
||||
self.tokenizer = wrapped.tokenizer
|
||||
self.token_mults = {}
|
||||
|
||||
self.comma_token = [v for k, v in self.tokenizer.get_vocab().items() if k == ',</w>'][0]
|
||||
|
||||
tokens_with_parens = [(k, v) for k, v in self.tokenizer.get_vocab().items() if '(' in k or ')' in k or '[' in k or ']' in k]
|
||||
for text, ident in tokens_with_parens:
|
||||
mult = 1.0
|
||||
@@ -127,7 +128,6 @@ class FrozenCLIPEmbedderWithCustomWords(torch.nn.Module):
|
||||
self.token_mults[ident] = mult
|
||||
|
||||
def tokenize_line(self, line, used_custom_terms, hijack_comments):
|
||||
id_start = self.wrapped.tokenizer.bos_token_id
|
||||
id_end = self.wrapped.tokenizer.eos_token_id
|
||||
|
||||
if opts.enable_emphasis:
|
||||
@@ -140,6 +140,7 @@ class FrozenCLIPEmbedderWithCustomWords(torch.nn.Module):
|
||||
fixes = []
|
||||
remade_tokens = []
|
||||
multipliers = []
|
||||
last_comma = -1
|
||||
|
||||
for tokens, (text, weight) in zip(tokenized, parsed):
|
||||
i = 0
|
||||
@@ -148,13 +149,33 @@ class FrozenCLIPEmbedderWithCustomWords(torch.nn.Module):
|
||||
|
||||
embedding, embedding_length_in_tokens = self.hijack.embedding_db.find_embedding_at_position(tokens, i)
|
||||
|
||||
if token == self.comma_token:
|
||||
last_comma = len(remade_tokens)
|
||||
elif opts.comma_padding_backtrack != 0 and max(len(remade_tokens), 1) % 75 == 0 and last_comma != -1 and len(remade_tokens) - last_comma <= opts.comma_padding_backtrack:
|
||||
last_comma += 1
|
||||
reloc_tokens = remade_tokens[last_comma:]
|
||||
reloc_mults = multipliers[last_comma:]
|
||||
|
||||
remade_tokens = remade_tokens[:last_comma]
|
||||
length = len(remade_tokens)
|
||||
|
||||
rem = int(math.ceil(length / 75)) * 75 - length
|
||||
remade_tokens += [id_end] * rem + reloc_tokens
|
||||
multipliers = multipliers[:last_comma] + [1.0] * rem + reloc_mults
|
||||
|
||||
if embedding is None:
|
||||
remade_tokens.append(token)
|
||||
multipliers.append(weight)
|
||||
i += 1
|
||||
else:
|
||||
emb_len = int(embedding.vec.shape[0])
|
||||
fixes.append((len(remade_tokens), embedding))
|
||||
iteration = len(remade_tokens) // 75
|
||||
if (len(remade_tokens) + emb_len) // 75 != iteration:
|
||||
rem = (75 * (iteration + 1) - len(remade_tokens))
|
||||
remade_tokens += [id_end] * rem
|
||||
multipliers += [1.0] * rem
|
||||
iteration += 1
|
||||
fixes.append((iteration, (len(remade_tokens) % 75, embedding)))
|
||||
remade_tokens += [0] * emb_len
|
||||
multipliers += [weight] * emb_len
|
||||
used_custom_terms.append((embedding.name, embedding.checksum()))
|
||||
@@ -162,10 +183,10 @@ class FrozenCLIPEmbedderWithCustomWords(torch.nn.Module):
|
||||
|
||||
token_count = len(remade_tokens)
|
||||
prompt_target_length = get_target_prompt_token_count(token_count)
|
||||
tokens_to_add = prompt_target_length - len(remade_tokens) + 1
|
||||
tokens_to_add = prompt_target_length - len(remade_tokens)
|
||||
|
||||
remade_tokens = [id_start] + remade_tokens + [id_end] * tokens_to_add
|
||||
multipliers = [1.0] + multipliers + [1.0] * tokens_to_add
|
||||
remade_tokens = remade_tokens + [id_end] * tokens_to_add
|
||||
multipliers = multipliers + [1.0] * tokens_to_add
|
||||
|
||||
return remade_tokens, fixes, multipliers, token_count
|
||||
|
||||
@@ -260,29 +281,55 @@ class FrozenCLIPEmbedderWithCustomWords(torch.nn.Module):
|
||||
hijack_fixes.append(fixes)
|
||||
batch_multipliers.append(multipliers)
|
||||
return batch_multipliers, remade_batch_tokens, used_custom_terms, hijack_comments, hijack_fixes, token_count
|
||||
|
||||
|
||||
def forward(self, text):
|
||||
|
||||
if opts.use_old_emphasis_implementation:
|
||||
use_old = opts.use_old_emphasis_implementation
|
||||
if use_old:
|
||||
batch_multipliers, remade_batch_tokens, used_custom_terms, hijack_comments, hijack_fixes, token_count = self.process_text_old(text)
|
||||
else:
|
||||
batch_multipliers, remade_batch_tokens, used_custom_terms, hijack_comments, hijack_fixes, token_count = self.process_text(text)
|
||||
|
||||
self.hijack.fixes = hijack_fixes
|
||||
self.hijack.comments += hijack_comments
|
||||
|
||||
if len(used_custom_terms) > 0:
|
||||
self.hijack.comments.append("Used embeddings: " + ", ".join([f'{word} [{checksum}]' for word, checksum in used_custom_terms]))
|
||||
|
||||
if use_old:
|
||||
self.hijack.fixes = hijack_fixes
|
||||
return self.process_tokens(remade_batch_tokens, batch_multipliers)
|
||||
|
||||
z = None
|
||||
i = 0
|
||||
while max(map(len, remade_batch_tokens)) != 0:
|
||||
rem_tokens = [x[75:] for x in remade_batch_tokens]
|
||||
rem_multipliers = [x[75:] for x in batch_multipliers]
|
||||
|
||||
target_token_count = get_target_prompt_token_count(token_count) + 2
|
||||
self.hijack.fixes = []
|
||||
for unfiltered in hijack_fixes:
|
||||
fixes = []
|
||||
for fix in unfiltered:
|
||||
if fix[0] == i:
|
||||
fixes.append(fix[1])
|
||||
self.hijack.fixes.append(fixes)
|
||||
|
||||
z1 = self.process_tokens([x[:75] for x in remade_batch_tokens], [x[:75] for x in batch_multipliers])
|
||||
z = z1 if z is None else torch.cat((z, z1), axis=-2)
|
||||
|
||||
remade_batch_tokens = rem_tokens
|
||||
batch_multipliers = rem_multipliers
|
||||
i += 1
|
||||
|
||||
return z
|
||||
|
||||
|
||||
def process_tokens(self, remade_batch_tokens, batch_multipliers):
|
||||
if not opts.use_old_emphasis_implementation:
|
||||
remade_batch_tokens = [[self.wrapped.tokenizer.bos_token_id] + x[:75] + [self.wrapped.tokenizer.eos_token_id] for x in remade_batch_tokens]
|
||||
batch_multipliers = [[1.0] + x[:75] + [1.0] for x in batch_multipliers]
|
||||
|
||||
tokens = torch.asarray(remade_batch_tokens).to(device)
|
||||
outputs = self.wrapped.transformer(input_ids=tokens, output_hidden_states=-opts.CLIP_stop_at_last_layers)
|
||||
|
||||
position_ids_array = [min(x, 75) for x in range(target_token_count-1)] + [76]
|
||||
position_ids = torch.asarray(position_ids_array, device=devices.device).expand((1, -1))
|
||||
|
||||
remade_batch_tokens_of_same_length = [x + [self.wrapped.tokenizer.eos_token_id] * (target_token_count - len(x)) for x in remade_batch_tokens]
|
||||
tokens = torch.asarray(remade_batch_tokens_of_same_length).to(device)
|
||||
|
||||
outputs = self.wrapped.transformer(input_ids=tokens, position_ids=position_ids, output_hidden_states=-opts.CLIP_stop_at_last_layers)
|
||||
if opts.CLIP_stop_at_last_layers > 1:
|
||||
z = outputs.hidden_states[-opts.CLIP_stop_at_last_layers]
|
||||
z = self.wrapped.transformer.text_model.final_layer_norm(z)
|
||||
@@ -290,7 +337,7 @@ class FrozenCLIPEmbedderWithCustomWords(torch.nn.Module):
|
||||
z = outputs.last_hidden_state
|
||||
|
||||
# restoring original mean is likely not correct, but it seems to work well to prevent artifacts that happen otherwise
|
||||
batch_multipliers_of_same_length = [x + [1.0] * (target_token_count - len(x)) for x in batch_multipliers]
|
||||
batch_multipliers_of_same_length = [x + [1.0] * (75 - len(x)) for x in batch_multipliers]
|
||||
batch_multipliers = torch.asarray(batch_multipliers_of_same_length).to(device)
|
||||
original_mean = z.mean()
|
||||
z *= batch_multipliers.reshape(batch_multipliers.shape + (1,)).expand(z.shape)
|
||||
|
@@ -9,12 +9,12 @@ from ldm.util import default
|
||||
from einops import rearrange
|
||||
|
||||
from modules import shared
|
||||
from modules.hypernetwork import hypernetwork
|
||||
|
||||
|
||||
if shared.cmd_opts.xformers or shared.cmd_opts.force_enable_xformers:
|
||||
try:
|
||||
import xformers.ops
|
||||
import functorch
|
||||
xformers._is_functorch_available = True
|
||||
shared.xformers_available = True
|
||||
except Exception:
|
||||
print("Cannot import xformers", file=sys.stderr)
|
||||
@@ -28,16 +28,10 @@ def split_cross_attention_forward_v1(self, x, context=None, mask=None):
|
||||
q_in = self.to_q(x)
|
||||
context = default(context, x)
|
||||
|
||||
hypernetwork = shared.loaded_hypernetwork
|
||||
hypernetwork_layers = (hypernetwork.layers if hypernetwork is not None else {}).get(context.shape[2], None)
|
||||
|
||||
if hypernetwork_layers is not None:
|
||||
k_in = self.to_k(hypernetwork_layers[0](context))
|
||||
v_in = self.to_v(hypernetwork_layers[1](context))
|
||||
else:
|
||||
k_in = self.to_k(context)
|
||||
v_in = self.to_v(context)
|
||||
del context, x
|
||||
context_k, context_v = hypernetwork.apply_hypernetwork(shared.loaded_hypernetwork, context)
|
||||
k_in = self.to_k(context_k)
|
||||
v_in = self.to_v(context_v)
|
||||
del context, context_k, context_v, x
|
||||
|
||||
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q_in, k_in, v_in))
|
||||
del q_in, k_in, v_in
|
||||
@@ -61,22 +55,16 @@ def split_cross_attention_forward_v1(self, x, context=None, mask=None):
|
||||
return self.to_out(r2)
|
||||
|
||||
|
||||
# taken from https://github.com/Doggettx/stable-diffusion
|
||||
# taken from https://github.com/Doggettx/stable-diffusion and modified
|
||||
def split_cross_attention_forward(self, x, context=None, mask=None):
|
||||
h = self.heads
|
||||
|
||||
q_in = self.to_q(x)
|
||||
context = default(context, x)
|
||||
|
||||
hypernetwork = shared.loaded_hypernetwork
|
||||
hypernetwork_layers = (hypernetwork.layers if hypernetwork is not None else {}).get(context.shape[2], None)
|
||||
|
||||
if hypernetwork_layers is not None:
|
||||
k_in = self.to_k(hypernetwork_layers[0](context))
|
||||
v_in = self.to_v(hypernetwork_layers[1](context))
|
||||
else:
|
||||
k_in = self.to_k(context)
|
||||
v_in = self.to_v(context)
|
||||
context_k, context_v = hypernetwork.apply_hypernetwork(shared.loaded_hypernetwork, context)
|
||||
k_in = self.to_k(context_k)
|
||||
v_in = self.to_v(context_v)
|
||||
|
||||
k_in *= self.scale
|
||||
|
||||
@@ -132,14 +120,11 @@ def xformers_attention_forward(self, x, context=None, mask=None):
|
||||
h = self.heads
|
||||
q_in = self.to_q(x)
|
||||
context = default(context, x)
|
||||
hypernetwork = shared.loaded_hypernetwork
|
||||
hypernetwork_layers = (hypernetwork.layers if hypernetwork is not None else {}).get(context.shape[2], None)
|
||||
if hypernetwork_layers is not None:
|
||||
k_in = self.to_k(hypernetwork_layers[0](context))
|
||||
v_in = self.to_v(hypernetwork_layers[1](context))
|
||||
else:
|
||||
k_in = self.to_k(context)
|
||||
v_in = self.to_v(context)
|
||||
|
||||
context_k, context_v = hypernetwork.apply_hypernetwork(shared.loaded_hypernetwork, context)
|
||||
k_in = self.to_k(context_k)
|
||||
v_in = self.to_v(context_v)
|
||||
|
||||
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b n h d', h=h), (q_in, k_in, v_in))
|
||||
del q_in, k_in, v_in
|
||||
out = xformers.ops.memory_efficient_attention(q, k, v, attn_bias=None)
|
||||
|
@@ -149,8 +149,13 @@ def load_model_weights(model, checkpoint_info):
|
||||
model.half()
|
||||
|
||||
devices.dtype = torch.float32 if shared.cmd_opts.no_half else torch.float16
|
||||
devices.dtype_vae = torch.float32 if shared.cmd_opts.no_half or shared.cmd_opts.no_half_vae else torch.float16
|
||||
|
||||
vae_file = os.path.splitext(checkpoint_file)[0] + ".vae.pt"
|
||||
|
||||
if not os.path.exists(vae_file) and shared.cmd_opts.vae_path is not None:
|
||||
vae_file = shared.cmd_opts.vae_path
|
||||
|
||||
if os.path.exists(vae_file):
|
||||
print(f"Loading VAE weights from: {vae_file}")
|
||||
vae_ckpt = torch.load(vae_file, map_location="cpu")
|
||||
@@ -158,6 +163,8 @@ def load_model_weights(model, checkpoint_info):
|
||||
|
||||
model.first_stage_model.load_state_dict(vae_dict)
|
||||
|
||||
model.first_stage_model.to(devices.dtype_vae)
|
||||
|
||||
model.sd_model_hash = sd_model_hash
|
||||
model.sd_model_checkpoint = checkpoint_file
|
||||
model.sd_checkpoint_info = checkpoint_info
|
||||
|
@@ -7,7 +7,7 @@ import inspect
|
||||
import k_diffusion.sampling
|
||||
import ldm.models.diffusion.ddim
|
||||
import ldm.models.diffusion.plms
|
||||
from modules import prompt_parser
|
||||
from modules import prompt_parser, devices, processing
|
||||
|
||||
from modules.shared import opts, cmd_opts, state
|
||||
import modules.shared as shared
|
||||
@@ -83,7 +83,7 @@ def setup_img2img_steps(p, steps=None):
|
||||
|
||||
|
||||
def sample_to_image(samples):
|
||||
x_sample = shared.sd_model.decode_first_stage(samples[0:1].type(shared.sd_model.dtype))[0]
|
||||
x_sample = processing.decode_first_stage(shared.sd_model, samples[0:1])[0]
|
||||
x_sample = torch.clamp((x_sample + 1.0) / 2.0, min=0.0, max=1.0)
|
||||
x_sample = 255. * np.moveaxis(x_sample.cpu().numpy(), 0, 2)
|
||||
x_sample = x_sample.astype(np.uint8)
|
||||
|
@@ -13,7 +13,8 @@ import modules.memmon
|
||||
import modules.sd_models
|
||||
import modules.styles
|
||||
import modules.devices as devices
|
||||
from modules import sd_samplers, hypernetwork
|
||||
from modules import sd_samplers
|
||||
from modules.hypernetwork import hypernetwork
|
||||
from modules.paths import models_path, script_path, sd_path
|
||||
|
||||
sd_model_file = os.path.join(script_path, 'model.ckpt')
|
||||
@@ -25,9 +26,11 @@ parser.add_argument("--ckpt-dir", type=str, default=None, help="Path to director
|
||||
parser.add_argument("--gfpgan-dir", type=str, help="GFPGAN directory", default=('./src/gfpgan' if os.path.exists('./src/gfpgan') else './GFPGAN'))
|
||||
parser.add_argument("--gfpgan-model", type=str, help="GFPGAN model file name", default=None)
|
||||
parser.add_argument("--no-half", action='store_true', help="do not switch the model to 16-bit floats")
|
||||
parser.add_argument("--no-half-vae", action='store_true', help="do not switch the VAE model to 16-bit floats")
|
||||
parser.add_argument("--no-progressbar-hiding", action='store_true', help="do not hide progressbar in gradio UI (we hide it because it slows down ML if you have hardware acceleration in browser)")
|
||||
parser.add_argument("--max-batch-count", type=int, default=16, help="maximum batch count value for the UI")
|
||||
parser.add_argument("--embeddings-dir", type=str, default=os.path.join(script_path, 'embeddings'), help="embeddings directory for textual inversion (default: embeddings)")
|
||||
parser.add_argument("--hypernetwork-dir", type=str, default=os.path.join(models_path, 'hypernetworks'), help="hypernetwork directory")
|
||||
parser.add_argument("--allow-code", action='store_true', help="allow custom script execution from webui")
|
||||
parser.add_argument("--medvram", action='store_true', help="enable stable diffusion model optimizations for sacrificing a little speed for low VRM usage")
|
||||
parser.add_argument("--lowvram", action='store_true', help="enable stable diffusion model optimizations for sacrificing a lot of speed for very low VRM usage")
|
||||
@@ -65,6 +68,7 @@ parser.add_argument("--autolaunch", action='store_true', help="open the webui UR
|
||||
parser.add_argument("--use-textbox-seed", action='store_true', help="use textbox for seeds in UI (no up/down, but possible to input long seeds)", default=False)
|
||||
parser.add_argument("--disable-console-progressbars", action='store_true', help="do not output progressbars to console", default=False)
|
||||
parser.add_argument("--enable-console-prompts", action='store_true', help="print prompts to console when generating with txt2img and img2img", default=False)
|
||||
parser.add_argument('--vae-path', type=str, help='Path to Variational Autoencoders model', default=None)
|
||||
parser.add_argument("--disable-safe-unpickle", action='store_true', help="disable checking pytorch models for malicious code", default=False)
|
||||
|
||||
|
||||
@@ -80,10 +84,17 @@ parallel_processing_allowed = not cmd_opts.lowvram and not cmd_opts.medvram
|
||||
xformers_available = False
|
||||
config_filename = cmd_opts.ui_settings_file
|
||||
|
||||
hypernetworks = hypernetwork.list_hypernetworks(os.path.join(models_path, 'hypernetworks'))
|
||||
hypernetworks = hypernetwork.list_hypernetworks(cmd_opts.hypernetwork_dir)
|
||||
loaded_hypernetwork = None
|
||||
|
||||
|
||||
def reload_hypernetworks():
|
||||
global hypernetworks
|
||||
|
||||
hypernetworks = hypernetwork.list_hypernetworks(cmd_opts.hypernetwork_dir)
|
||||
hypernetwork.load_hypernetwork(opts.sd_hypernetwork)
|
||||
|
||||
|
||||
class State:
|
||||
skipped = False
|
||||
interrupted = False
|
||||
@@ -171,6 +182,7 @@ options_templates.update(options_section(('saving-images', "Saving images/grids"
|
||||
|
||||
"use_original_name_batch": OptionInfo(False, "Use original name for output filename during batch process in extras tab"),
|
||||
"save_selected_only": OptionInfo(True, "When using 'Save' button, only save a single selected image"),
|
||||
"do_not_add_watermark": OptionInfo(False, "Do not add watermark to images"),
|
||||
}))
|
||||
|
||||
options_templates.update(options_section(('saving-paths', "Paths for saving"), {
|
||||
@@ -224,6 +236,7 @@ options_templates.update(options_section(('sd', "Stable Diffusion"), {
|
||||
"enable_emphasis": OptionInfo(True, "Emphasis: use (text) to make model pay more attention to text and [text] to make it pay less attention"),
|
||||
"use_old_emphasis_implementation": OptionInfo(False, "Use old emphasis implementation. Can be useful to reproduce old seeds."),
|
||||
"enable_batch_seeds": OptionInfo(True, "Make K-diffusion samplers produce same images in a batch as when making a single image"),
|
||||
"comma_padding_backtrack": OptionInfo(20, "Increase coherency by padding from the last comma within n tokens when using more than 75 tokens", gr.Slider, {"minimum": 0, "maximum": 74, "step": 1 }),
|
||||
"filter_nsfw": OptionInfo(False, "Filter NSFW content"),
|
||||
'CLIP_stop_at_last_layers': OptionInfo(1, "Stop At last layers of CLIP model", gr.Slider, {"minimum": 1, "maximum": 12, "step": 1}),
|
||||
"random_artist_categories": OptionInfo([], "Allowed categories for random artists selection when using the Roll button", gr.CheckboxGroup, {"choices": artist_db.categories()}),
|
||||
@@ -236,6 +249,7 @@ options_templates.update(options_section(('interrogate', "Interrogate Options"),
|
||||
"interrogate_clip_min_length": OptionInfo(24, "Interrogate: minimum description length (excluding artists, etc..)", gr.Slider, {"minimum": 1, "maximum": 128, "step": 1}),
|
||||
"interrogate_clip_max_length": OptionInfo(48, "Interrogate: maximum description length", gr.Slider, {"minimum": 1, "maximum": 256, "step": 1}),
|
||||
"interrogate_clip_dict_limit": OptionInfo(1500, "Interrogate: maximum number of lines in text file (0 = No limit)"),
|
||||
"interrogate_deepbooru_score_threshold": OptionInfo(0.5, "Interrogate: deepbooru score threshold", gr.Slider, {"minimum": 0, "maximum": 1, "step": 0.01}),
|
||||
}))
|
||||
|
||||
options_templates.update(options_section(('ui', "User interface"), {
|
||||
@@ -259,6 +273,7 @@ options_templates.update(options_section(('sampler-params', "Sampler parameters"
|
||||
's_churn': OptionInfo(0.0, "sigma churn", gr.Slider, {"minimum": 0.0, "maximum": 1.0, "step": 0.01}),
|
||||
's_tmin': OptionInfo(0.0, "sigma tmin", gr.Slider, {"minimum": 0.0, "maximum": 1.0, "step": 0.01}),
|
||||
's_noise': OptionInfo(1.0, "sigma noise", gr.Slider, {"minimum": 0.0, "maximum": 1.0, "step": 0.01}),
|
||||
'eta_noise_seed_delta': OptionInfo(0, "Eta noise seed delta", gr.Number, {"precision": 0}),
|
||||
}))
|
||||
|
||||
|
||||
|
@@ -10,6 +10,7 @@ from tqdm import tqdm
|
||||
from modules import modelloader
|
||||
from modules.shared import cmd_opts, opts, device
|
||||
from modules.swinir_model_arch import SwinIR as net
|
||||
from modules.swinir_model_arch_v2 import Swin2SR as net2
|
||||
from modules.upscaler import Upscaler, UpscalerData
|
||||
|
||||
precision_scope = (
|
||||
@@ -57,22 +58,42 @@ class UpscalerSwinIR(Upscaler):
|
||||
filename = path
|
||||
if filename is None or not os.path.exists(filename):
|
||||
return None
|
||||
model = net(
|
||||
if filename.endswith(".v2.pth"):
|
||||
model = net2(
|
||||
upscale=scale,
|
||||
in_chans=3,
|
||||
img_size=64,
|
||||
window_size=8,
|
||||
img_range=1.0,
|
||||
depths=[6, 6, 6, 6, 6, 6, 6, 6, 6],
|
||||
embed_dim=240,
|
||||
num_heads=[8, 8, 8, 8, 8, 8, 8, 8, 8],
|
||||
depths=[6, 6, 6, 6, 6, 6],
|
||||
embed_dim=180,
|
||||
num_heads=[6, 6, 6, 6, 6, 6],
|
||||
mlp_ratio=2,
|
||||
upsampler="nearest+conv",
|
||||
resi_connection="3conv",
|
||||
)
|
||||
resi_connection="1conv",
|
||||
)
|
||||
params = None
|
||||
else:
|
||||
model = net(
|
||||
upscale=scale,
|
||||
in_chans=3,
|
||||
img_size=64,
|
||||
window_size=8,
|
||||
img_range=1.0,
|
||||
depths=[6, 6, 6, 6, 6, 6, 6, 6, 6],
|
||||
embed_dim=240,
|
||||
num_heads=[8, 8, 8, 8, 8, 8, 8, 8, 8],
|
||||
mlp_ratio=2,
|
||||
upsampler="nearest+conv",
|
||||
resi_connection="3conv",
|
||||
)
|
||||
params = "params_ema"
|
||||
|
||||
pretrained_model = torch.load(filename)
|
||||
model.load_state_dict(pretrained_model["params_ema"], strict=True)
|
||||
if params is not None:
|
||||
model.load_state_dict(pretrained_model[params], strict=True)
|
||||
else:
|
||||
model.load_state_dict(pretrained_model, strict=True)
|
||||
if not cmd_opts.no_half:
|
||||
model = model.half()
|
||||
return model
|
||||
|
1017
modules/swinir_model_arch_v2.py
Normal file
1017
modules/swinir_model_arch_v2.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -15,11 +15,10 @@ re_tag = re.compile(r"[a-zA-Z][_\w\d()]+")
|
||||
|
||||
|
||||
class PersonalizedBase(Dataset):
|
||||
def __init__(self, data_root, size=None, repeats=100, flip_p=0.5, placeholder_token="*", width=512, height=512, model=None, device=None, template_file=None):
|
||||
def __init__(self, data_root, width, height, repeats, flip_p=0.5, placeholder_token="*", model=None, device=None, template_file=None):
|
||||
|
||||
self.placeholder_token = placeholder_token
|
||||
|
||||
self.size = size
|
||||
self.width = width
|
||||
self.height = height
|
||||
self.flip = transforms.RandomHorizontalFlip(p=flip_p)
|
||||
|
@@ -7,8 +7,9 @@ import tqdm
|
||||
from modules import shared, images
|
||||
|
||||
|
||||
def preprocess(process_src, process_dst, process_flip, process_split, process_caption):
|
||||
size = 512
|
||||
def preprocess(process_src, process_dst, process_width, process_height, process_flip, process_split, process_caption):
|
||||
width = process_width
|
||||
height = process_height
|
||||
src = os.path.abspath(process_src)
|
||||
dst = os.path.abspath(process_dst)
|
||||
|
||||
@@ -55,23 +56,23 @@ def preprocess(process_src, process_dst, process_flip, process_split, process_ca
|
||||
is_wide = ratio < 1 / 1.35
|
||||
|
||||
if process_split and is_tall:
|
||||
img = img.resize((size, size * img.height // img.width))
|
||||
img = img.resize((width, height * img.height // img.width))
|
||||
|
||||
top = img.crop((0, 0, size, size))
|
||||
top = img.crop((0, 0, width, height))
|
||||
save_pic(top, index)
|
||||
|
||||
bot = img.crop((0, img.height - size, size, img.height))
|
||||
bot = img.crop((0, img.height - height, width, img.height))
|
||||
save_pic(bot, index)
|
||||
elif process_split and is_wide:
|
||||
img = img.resize((size * img.width // img.height, size))
|
||||
img = img.resize((width * img.width // img.height, height))
|
||||
|
||||
left = img.crop((0, 0, size, size))
|
||||
left = img.crop((0, 0, width, height))
|
||||
save_pic(left, index)
|
||||
|
||||
right = img.crop((img.width - size, 0, img.width, size))
|
||||
right = img.crop((img.width - width, 0, img.width, height))
|
||||
save_pic(right, index)
|
||||
else:
|
||||
img = images.resize_image(1, img, size, size)
|
||||
img = images.resize_image(1, img, width, height)
|
||||
save_pic(img, index)
|
||||
|
||||
shared.state.nextjob()
|
||||
|
@@ -156,7 +156,7 @@ def create_embedding(name, num_vectors_per_token, init_text='*'):
|
||||
return fn
|
||||
|
||||
|
||||
def train_embedding(embedding_name, learn_rate, data_root, log_directory, steps, create_image_every, save_embedding_every, template_file):
|
||||
def train_embedding(embedding_name, learn_rate, data_root, log_directory, training_width, training_height, steps, num_repeats, create_image_every, save_embedding_every, template_file, preview_image_prompt):
|
||||
assert embedding_name, 'embedding not selected'
|
||||
|
||||
shared.state.textinfo = "Initializing textual inversion training..."
|
||||
@@ -182,7 +182,7 @@ def train_embedding(embedding_name, learn_rate, data_root, log_directory, steps,
|
||||
|
||||
shared.state.textinfo = f"Preparing dataset from {html.escape(data_root)}..."
|
||||
with torch.autocast("cuda"):
|
||||
ds = modules.textual_inversion.dataset.PersonalizedBase(data_root=data_root, size=512, placeholder_token=embedding_name, model=shared.sd_model, device=devices.device, template_file=template_file)
|
||||
ds = modules.textual_inversion.dataset.PersonalizedBase(data_root=data_root, width=training_width, height=training_height, repeats=num_repeats, placeholder_token=embedding_name, model=shared.sd_model, device=devices.device, template_file=template_file)
|
||||
|
||||
hijack = sd_hijack.model_hijack
|
||||
|
||||
@@ -200,6 +200,9 @@ def train_embedding(embedding_name, learn_rate, data_root, log_directory, steps,
|
||||
if ititial_step > steps:
|
||||
return embedding, filename
|
||||
|
||||
tr_img_len = len([os.path.join(data_root, file_path) for file_path in os.listdir(data_root)])
|
||||
epoch_len = (tr_img_len * num_repeats) + tr_img_len
|
||||
|
||||
pbar = tqdm.tqdm(enumerate(ds), total=steps-ititial_step)
|
||||
for i, (x, text) in pbar:
|
||||
embedding.step = i + ititial_step
|
||||
@@ -223,7 +226,10 @@ def train_embedding(embedding_name, learn_rate, data_root, log_directory, steps,
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
|
||||
pbar.set_description(f"loss: {losses.mean():.7f}")
|
||||
epoch_num = embedding.step // epoch_len
|
||||
epoch_step = embedding.step - (epoch_num * epoch_len) + 1
|
||||
|
||||
pbar.set_description(f"[Epoch {epoch_num}: {epoch_step}/{epoch_len}]loss: {losses.mean():.7f}")
|
||||
|
||||
if embedding.step > 0 and embedding_dir is not None and embedding.step % save_embedding_every == 0:
|
||||
last_saved_file = os.path.join(embedding_dir, f'{embedding_name}-{embedding.step}.pt')
|
||||
@@ -232,10 +238,14 @@ def train_embedding(embedding_name, learn_rate, data_root, log_directory, steps,
|
||||
if embedding.step > 0 and images_dir is not None and embedding.step % create_image_every == 0:
|
||||
last_saved_image = os.path.join(images_dir, f'{embedding_name}-{embedding.step}.png')
|
||||
|
||||
preview_text = text if preview_image_prompt == "" else preview_image_prompt
|
||||
|
||||
p = processing.StableDiffusionProcessingTxt2Img(
|
||||
sd_model=shared.sd_model,
|
||||
prompt=text,
|
||||
prompt=preview_text,
|
||||
steps=20,
|
||||
height=training_height,
|
||||
width=training_width,
|
||||
do_not_save_grid=True,
|
||||
do_not_save_samples=True,
|
||||
)
|
||||
@@ -246,7 +256,7 @@ def train_embedding(embedding_name, learn_rate, data_root, log_directory, steps,
|
||||
shared.state.current_image = image
|
||||
image.save(last_saved_image)
|
||||
|
||||
last_saved_image += f", prompt: {text}"
|
||||
last_saved_image += f", prompt: {preview_text}"
|
||||
|
||||
shared.state.job_no = embedding.step
|
||||
|
||||
|
@@ -22,7 +22,6 @@ def preprocess(*args):
|
||||
|
||||
|
||||
def train_embedding(*args):
|
||||
|
||||
try:
|
||||
sd_hijack.undo_optimizations()
|
||||
|
||||
|
@@ -39,6 +39,7 @@ import modules.generation_parameters_copypaste
|
||||
from modules import prompt_parser
|
||||
from modules.images import save_image
|
||||
import modules.textual_inversion.ui
|
||||
import modules.hypernetwork.ui
|
||||
import modules.images_history as img_his
|
||||
|
||||
# this is a fix for Windows users. Without it, javascript files will be served with text/html content-type and the browser will not show any UI
|
||||
@@ -312,7 +313,7 @@ def interrogate(image):
|
||||
|
||||
|
||||
def interrogate_deepbooru(image):
|
||||
prompt = get_deepbooru_tags(image)
|
||||
prompt = get_deepbooru_tags(image, opts.interrogate_deepbooru_score_threshold)
|
||||
return gr_show(True) if prompt is None else prompt
|
||||
|
||||
|
||||
@@ -525,7 +526,7 @@ def create_ui(wrap_gradio_gpu_call):
|
||||
denoising_strength = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label='Denoising strength', value=0.7)
|
||||
|
||||
with gr.Row():
|
||||
batch_count = gr.Slider(minimum=1, maximum=cmd_opts.max_batch_count, step=1, label='Batch count', value=1)
|
||||
batch_count = gr.Slider(minimum=1, step=1, label='Batch count', value=1)
|
||||
batch_size = gr.Slider(minimum=1, maximum=8, step=1, label='Batch size', value=1)
|
||||
|
||||
cfg_scale = gr.Slider(minimum=1.0, maximum=30.0, step=0.5, label='CFG Scale', value=7.0)
|
||||
@@ -712,7 +713,7 @@ def create_ui(wrap_gradio_gpu_call):
|
||||
tiling = gr.Checkbox(label='Tiling', value=False)
|
||||
|
||||
with gr.Row():
|
||||
batch_count = gr.Slider(minimum=1, maximum=cmd_opts.max_batch_count, step=1, label='Batch count', value=1)
|
||||
batch_count = gr.Slider(minimum=1, step=1, label='Batch count', value=1)
|
||||
batch_size = gr.Slider(minimum=1, maximum=8, step=1, label='Batch size', value=1)
|
||||
|
||||
with gr.Group():
|
||||
@@ -964,7 +965,7 @@ def create_ui(wrap_gradio_gpu_call):
|
||||
|
||||
extras_send_to_inpaint.click(
|
||||
fn=lambda x: image_from_url_text(x),
|
||||
_js="extract_image_from_gallery_img2img",
|
||||
_js="extract_image_from_gallery_inpaint",
|
||||
inputs=[result_images],
|
||||
outputs=[init_img_with_mask],
|
||||
)
|
||||
@@ -1025,13 +1026,27 @@ def create_ui(wrap_gradio_gpu_call):
|
||||
gr.HTML(value="")
|
||||
|
||||
with gr.Column():
|
||||
create_embedding = gr.Button(value="Create", variant='primary')
|
||||
create_embedding = gr.Button(value="Create embedding", variant='primary')
|
||||
|
||||
with gr.Group():
|
||||
gr.HTML(value="<p style='margin-bottom: 0.7em'>Create a new hypernetwork</p>")
|
||||
|
||||
new_hypernetwork_name = gr.Textbox(label="Name")
|
||||
|
||||
with gr.Row():
|
||||
with gr.Column(scale=3):
|
||||
gr.HTML(value="")
|
||||
|
||||
with gr.Column():
|
||||
create_hypernetwork = gr.Button(value="Create hypernetwork", variant='primary')
|
||||
|
||||
with gr.Group():
|
||||
gr.HTML(value="<p style='margin-bottom: 0.7em'>Preprocess images</p>")
|
||||
|
||||
process_src = gr.Textbox(label='Source directory')
|
||||
process_dst = gr.Textbox(label='Destination directory')
|
||||
process_width = gr.Slider(minimum=64, maximum=2048, step=64, label="Width", value=512)
|
||||
process_height = gr.Slider(minimum=64, maximum=2048, step=64, label="Height", value=512)
|
||||
|
||||
with gr.Row():
|
||||
process_flip = gr.Checkbox(label='Create flipped copies')
|
||||
@@ -1046,24 +1061,25 @@ def create_ui(wrap_gradio_gpu_call):
|
||||
run_preprocess = gr.Button(value="Preprocess", variant='primary')
|
||||
|
||||
with gr.Group():
|
||||
gr.HTML(value="<p style='margin-bottom: 0.7em'>Train an embedding; must specify a directory with a set of 512x512 images</p>")
|
||||
gr.HTML(value="<p style='margin-bottom: 0.7em'>Train an embedding; must specify a directory with a set of 1:1 ratio images</p>")
|
||||
train_embedding_name = gr.Dropdown(label='Embedding', choices=sorted(sd_hijack.model_hijack.embedding_db.word_embeddings.keys()))
|
||||
train_hypernetwork_name = gr.Dropdown(label='Hypernetwork', choices=[x for x in shared.hypernetworks.keys()])
|
||||
learn_rate = gr.Number(label='Learning rate', value=5.0e-03)
|
||||
dataset_directory = gr.Textbox(label='Dataset directory', placeholder="Path to directory with input images")
|
||||
log_directory = gr.Textbox(label='Log directory', placeholder="Path to directory where to write outputs", value="textual_inversion")
|
||||
template_file = gr.Textbox(label='Prompt template file', value=os.path.join(script_path, "textual_inversion_templates", "style_filewords.txt"))
|
||||
training_width = gr.Slider(minimum=64, maximum=2048, step=64, label="Width", value=512)
|
||||
training_height = gr.Slider(minimum=64, maximum=2048, step=64, label="Height", value=512)
|
||||
steps = gr.Number(label='Max steps', value=100000, precision=0)
|
||||
num_repeats = gr.Number(label='Number of repeats for a single input image per epoch', value=100, precision=0)
|
||||
create_image_every = gr.Number(label='Save an image to log directory every N steps, 0 to disable', value=500, precision=0)
|
||||
save_embedding_every = gr.Number(label='Save a copy of embedding to log directory every N steps, 0 to disable', value=500, precision=0)
|
||||
preview_image_prompt = gr.Textbox(label='Preview prompt', value="")
|
||||
|
||||
with gr.Row():
|
||||
with gr.Column(scale=2):
|
||||
gr.HTML(value="")
|
||||
|
||||
with gr.Column():
|
||||
with gr.Row():
|
||||
interrupt_training = gr.Button(value="Interrupt")
|
||||
train_embedding = gr.Button(value="Train", variant='primary')
|
||||
interrupt_training = gr.Button(value="Interrupt")
|
||||
train_hypernetwork = gr.Button(value="Train Hypernetwork", variant='primary')
|
||||
train_embedding = gr.Button(value="Train Embedding", variant='primary')
|
||||
|
||||
with gr.Column():
|
||||
progressbar = gr.HTML(elem_id="ti_progressbar")
|
||||
@@ -1089,12 +1105,26 @@ def create_ui(wrap_gradio_gpu_call):
|
||||
]
|
||||
)
|
||||
|
||||
create_hypernetwork.click(
|
||||
fn=modules.hypernetwork.ui.create_hypernetwork,
|
||||
inputs=[
|
||||
new_hypernetwork_name,
|
||||
],
|
||||
outputs=[
|
||||
train_hypernetwork_name,
|
||||
ti_output,
|
||||
ti_outcome,
|
||||
]
|
||||
)
|
||||
|
||||
run_preprocess.click(
|
||||
fn=wrap_gradio_gpu_call(modules.textual_inversion.ui.preprocess, extra_outputs=[gr.update()]),
|
||||
_js="start_training_textual_inversion",
|
||||
inputs=[
|
||||
process_src,
|
||||
process_dst,
|
||||
process_width,
|
||||
process_height,
|
||||
process_flip,
|
||||
process_split,
|
||||
process_caption,
|
||||
@@ -1113,10 +1143,34 @@ def create_ui(wrap_gradio_gpu_call):
|
||||
learn_rate,
|
||||
dataset_directory,
|
||||
log_directory,
|
||||
training_width,
|
||||
training_height,
|
||||
steps,
|
||||
num_repeats,
|
||||
create_image_every,
|
||||
save_embedding_every,
|
||||
template_file,
|
||||
preview_image_prompt,
|
||||
],
|
||||
outputs=[
|
||||
ti_output,
|
||||
ti_outcome,
|
||||
]
|
||||
)
|
||||
|
||||
train_hypernetwork.click(
|
||||
fn=wrap_gradio_gpu_call(modules.hypernetwork.ui.train_hypernetwork, extra_outputs=[gr.update()]),
|
||||
_js="start_training_textual_inversion",
|
||||
inputs=[
|
||||
train_hypernetwork_name,
|
||||
learn_rate,
|
||||
dataset_directory,
|
||||
log_directory,
|
||||
steps,
|
||||
create_image_every,
|
||||
save_embedding_every,
|
||||
template_file,
|
||||
preview_image_prompt,
|
||||
],
|
||||
outputs=[
|
||||
ti_output,
|
||||
@@ -1130,6 +1184,7 @@ def create_ui(wrap_gradio_gpu_call):
|
||||
outputs=[],
|
||||
)
|
||||
|
||||
|
||||
def create_setting_component(key):
|
||||
def fun():
|
||||
return opts.data[key] if key in opts.data else opts.data_labels[key].default
|
||||
@@ -1335,7 +1390,7 @@ Requested path was: {f}
|
||||
|
||||
with gr.Tabs() as tabs:
|
||||
for interface, label, ifid in interfaces:
|
||||
with gr.TabItem(label, id=ifid):
|
||||
with gr.TabItem(label, id=ifid, elem_id='tab_' + ifid):
|
||||
interface.render()
|
||||
|
||||
if os.path.exists(os.path.join(script_path, "notification.mp3")):
|
||||
|
Reference in New Issue
Block a user