Merge branch 'master' into tensorboard

2025-08-08 13:19:54 +00:00 · 2023-01-13 14:57:38 +03:00
parent 18f86e41f6 544e7a233e
commit 9cd7716753
148 changed files with 13968 additions and 3805 deletions
--- a/modules/textual_inversion/autocrop.py
+++ b/modules/textual_inversion/autocrop.py
@@ -0,0 +1,341 @@
+import cv2
+import requests
+import os
+from collections import defaultdict
+from math import log, sqrt
+import numpy as np
+from PIL import Image, ImageDraw
+
+GREEN = "#0F0"
+BLUE = "#00F"
+RED = "#F00"
+
+
+def crop_image(im, settings):
+  """ Intelligently crop an image to the subject matter """
+
+  scale_by = 1
+  if is_landscape(im.width, im.height):
+    scale_by = settings.crop_height / im.height
+  elif is_portrait(im.width, im.height):
+    scale_by = settings.crop_width / im.width
+  elif is_square(im.width, im.height):
+    if is_square(settings.crop_width, settings.crop_height):
+      scale_by = settings.crop_width / im.width
+    elif is_landscape(settings.crop_width, settings.crop_height):
+      scale_by = settings.crop_width / im.width
+    elif is_portrait(settings.crop_width, settings.crop_height):
+      scale_by = settings.crop_height / im.height
+
+  im = im.resize((int(im.width * scale_by), int(im.height * scale_by)))
+  im_debug = im.copy()
+
+  focus = focal_point(im_debug, settings)
+
+  # take the focal point and turn it into crop coordinates that try to center over the focal
+  # point but then get adjusted back into the frame
+  y_half = int(settings.crop_height / 2)
+  x_half = int(settings.crop_width / 2)
+
+  x1 = focus.x - x_half
+  if x1 < 0:
+      x1 = 0
+  elif x1 + settings.crop_width > im.width:
+      x1 = im.width - settings.crop_width
+
+  y1 = focus.y - y_half
+  if y1 < 0:
+      y1 = 0
+  elif y1 + settings.crop_height > im.height:
+      y1 = im.height - settings.crop_height
+
+  x2 = x1 + settings.crop_width
+  y2 = y1 + settings.crop_height
+
+  crop = [x1, y1, x2, y2]
+
+  results = []
+
+  results.append(im.crop(tuple(crop)))
+
+  if settings.annotate_image:
+    d = ImageDraw.Draw(im_debug)
+    rect = list(crop)
+    rect[2] -= 1
+    rect[3] -= 1
+    d.rectangle(rect, outline=GREEN)
+    results.append(im_debug)
+    if settings.destop_view_image:
+      im_debug.show()
+
+  return results
+
+def focal_point(im, settings):
+    corner_points = image_corner_points(im, settings) if settings.corner_points_weight > 0 else []
+    entropy_points = image_entropy_points(im, settings) if settings.entropy_points_weight > 0 else []
+    face_points = image_face_points(im, settings) if settings.face_points_weight > 0 else []
+
+    pois = []
+
+    weight_pref_total = 0
+    if len(corner_points) > 0:
+      weight_pref_total += settings.corner_points_weight
+    if len(entropy_points) > 0:
+      weight_pref_total += settings.entropy_points_weight
+    if len(face_points) > 0:
+      weight_pref_total += settings.face_points_weight
+
+    corner_centroid = None
+    if len(corner_points) > 0:
+      corner_centroid = centroid(corner_points)
+      corner_centroid.weight = settings.corner_points_weight / weight_pref_total 
+      pois.append(corner_centroid)
+
+    entropy_centroid = None
+    if len(entropy_points) > 0:
+      entropy_centroid = centroid(entropy_points)
+      entropy_centroid.weight = settings.entropy_points_weight / weight_pref_total
+      pois.append(entropy_centroid)
+
+    face_centroid = None
+    if len(face_points) > 0:
+      face_centroid = centroid(face_points)
+      face_centroid.weight = settings.face_points_weight / weight_pref_total 
+      pois.append(face_centroid)
+
+    average_point = poi_average(pois, settings)
+
+    if settings.annotate_image:
+      d = ImageDraw.Draw(im)
+      max_size = min(im.width, im.height) * 0.07
+      if corner_centroid is not None:
+        color = BLUE
+        box = corner_centroid.bounding(max_size * corner_centroid.weight)
+        d.text((box[0], box[1]-15), "Edge: %.02f" % corner_centroid.weight, fill=color)
+        d.ellipse(box, outline=color)
+        if len(corner_points) > 1:
+          for f in corner_points:
+            d.rectangle(f.bounding(4), outline=color)
+      if entropy_centroid is not None:
+        color = "#ff0"
+        box = entropy_centroid.bounding(max_size * entropy_centroid.weight)
+        d.text((box[0], box[1]-15), "Entropy: %.02f" % entropy_centroid.weight, fill=color)
+        d.ellipse(box, outline=color)
+        if len(entropy_points) > 1:
+          for f in entropy_points:
+            d.rectangle(f.bounding(4), outline=color)
+      if face_centroid is not None:
+        color = RED
+        box = face_centroid.bounding(max_size * face_centroid.weight)
+        d.text((box[0], box[1]-15), "Face: %.02f" % face_centroid.weight, fill=color)
+        d.ellipse(box, outline=color)
+        if len(face_points) > 1:
+          for f in face_points:
+            d.rectangle(f.bounding(4), outline=color)
+
+      d.ellipse(average_point.bounding(max_size), outline=GREEN)
+      
+    return average_point
+
+
+def image_face_points(im, settings):
+    if settings.dnn_model_path is not None:
+      detector = cv2.FaceDetectorYN.create(
+          settings.dnn_model_path,
+          "",
+          (im.width, im.height),
+          0.9, # score threshold
+          0.3, # nms threshold
+          5000 # keep top k before nms
+      )
+      faces = detector.detect(np.array(im))
+      results = []
+      if faces[1] is not None:
+        for face in faces[1]:
+          x = face[0]
+          y = face[1]
+          w = face[2]
+          h = face[3]
+          results.append(
+            PointOfInterest(
+              int(x + (w * 0.5)), # face focus left/right is center
+              int(y + (h * 0.33)), # face focus up/down is close to the top of the head
+              size = w,
+              weight = 1/len(faces[1])
+            )
+          )
+      return results
+    else:
+      np_im = np.array(im)
+      gray = cv2.cvtColor(np_im, cv2.COLOR_BGR2GRAY)
+
+      tries = [
+        [ f'{cv2.data.haarcascades}haarcascade_eye.xml', 0.01 ],
+        [ f'{cv2.data.haarcascades}haarcascade_frontalface_default.xml', 0.05 ],
+        [ f'{cv2.data.haarcascades}haarcascade_profileface.xml', 0.05 ],
+        [ f'{cv2.data.haarcascades}haarcascade_frontalface_alt.xml', 0.05 ],
+        [ f'{cv2.data.haarcascades}haarcascade_frontalface_alt2.xml', 0.05 ],
+        [ f'{cv2.data.haarcascades}haarcascade_frontalface_alt_tree.xml', 0.05 ],
+        [ f'{cv2.data.haarcascades}haarcascade_eye_tree_eyeglasses.xml', 0.05 ],
+        [ f'{cv2.data.haarcascades}haarcascade_upperbody.xml', 0.05 ]
+      ]
+      for t in tries:
+        classifier = cv2.CascadeClassifier(t[0])
+        minsize = int(min(im.width, im.height) * t[1]) # at least N percent of the smallest side
+        try:
+          faces = classifier.detectMultiScale(gray, scaleFactor=1.1,
+            minNeighbors=7, minSize=(minsize, minsize), flags=cv2.CASCADE_SCALE_IMAGE)
+        except:
+          continue
+
+        if len(faces) > 0:
+          rects = [[f[0], f[1], f[0] + f[2], f[1] + f[3]] for f in faces]
+          return [PointOfInterest((r[0] +r[2]) // 2, (r[1] + r[3]) // 2, size=abs(r[0]-r[2]), weight=1/len(rects)) for r in rects]
+    return []
+
+
+def image_corner_points(im, settings):
+    grayscale = im.convert("L")
+
+    # naive attempt at preventing focal points from collecting at watermarks near the bottom
+    gd = ImageDraw.Draw(grayscale)
+    gd.rectangle([0, im.height*.9, im.width, im.height], fill="#999")
+
+    np_im = np.array(grayscale)
+
+    points = cv2.goodFeaturesToTrack(
+        np_im,
+        maxCorners=100,
+        qualityLevel=0.04,
+        minDistance=min(grayscale.width, grayscale.height)*0.06,
+        useHarrisDetector=False,
+    )
+
+    if points is None:
+        return []
+
+    focal_points = []
+    for point in points:
+      x, y = point.ravel()
+      focal_points.append(PointOfInterest(x, y, size=4, weight=1/len(points)))
+
+    return focal_points
+
+
+def image_entropy_points(im, settings):
+    landscape = im.height < im.width
+    portrait = im.height > im.width
+    if landscape:
+      move_idx = [0, 2]
+      move_max = im.size[0]
+    elif portrait:
+      move_idx = [1, 3]
+      move_max = im.size[1]
+    else:
+      return []
+
+    e_max = 0
+    crop_current = [0, 0, settings.crop_width, settings.crop_height]
+    crop_best = crop_current
+    while crop_current[move_idx[1]] < move_max:
+        crop = im.crop(tuple(crop_current))
+        e = image_entropy(crop)
+
+        if (e > e_max):
+          e_max = e
+          crop_best = list(crop_current)
+
+        crop_current[move_idx[0]] += 4
+        crop_current[move_idx[1]] += 4
+
+    x_mid = int(crop_best[0] + settings.crop_width/2)
+    y_mid = int(crop_best[1] + settings.crop_height/2)
+
+    return [PointOfInterest(x_mid, y_mid, size=25, weight=1.0)]
+
+
+def image_entropy(im):
+    # greyscale image entropy
+    # band = np.asarray(im.convert("L"))
+    band = np.asarray(im.convert("1"), dtype=np.uint8)
+    hist, _ = np.histogram(band, bins=range(0, 256))
+    hist = hist[hist > 0]
+    return -np.log2(hist / hist.sum()).sum()
+
+def centroid(pois):
+  x = [poi.x for poi in pois]
+  y = [poi.y for poi in pois]
+  return PointOfInterest(sum(x)/len(pois), sum(y)/len(pois))
+
+
+def poi_average(pois, settings):
+    weight = 0.0
+    x = 0.0
+    y = 0.0
+    for poi in pois:
+        weight += poi.weight
+        x += poi.x * poi.weight
+        y += poi.y * poi.weight
+    avg_x = round(weight and x / weight)
+    avg_y = round(weight and y / weight)
+
+    return PointOfInterest(avg_x, avg_y)
+
+
+def is_landscape(w, h):
+  return w > h
+
+
+def is_portrait(w, h):
+  return h > w
+
+
+def is_square(w, h):
+  return w == h
+
+
+def download_and_cache_models(dirname):
+  download_url = 'https://github.com/opencv/opencv_zoo/blob/91fb0290f50896f38a0ab1e558b74b16bc009428/models/face_detection_yunet/face_detection_yunet_2022mar.onnx?raw=true'
+  model_file_name = 'face_detection_yunet.onnx'
+
+  if not os.path.exists(dirname):
+    os.makedirs(dirname)
+
+  cache_file = os.path.join(dirname, model_file_name)
+  if not os.path.exists(cache_file):
+    print(f"downloading face detection model from '{download_url}' to '{cache_file}'")
+    response = requests.get(download_url)
+    with open(cache_file, "wb") as f:
+      f.write(response.content)
+
+  if os.path.exists(cache_file):
+    return cache_file
+  return None
+
+
+class PointOfInterest:
+  def __init__(self, x, y, weight=1.0, size=10):
+    self.x = x
+    self.y = y
+    self.weight = weight
+    self.size = size
+
+  def bounding(self, size):
+    return [
+      self.x - size//2,
+      self.y - size//2,
+      self.x + size//2,
+      self.y + size//2
+    ]
+
+
+class Settings:
+  def __init__(self, crop_width=512, crop_height=512, corner_points_weight=0.5, entropy_points_weight=0.5, face_points_weight=0.5, annotate_image=False, dnn_model_path=None):
+    self.crop_width = crop_width
+    self.crop_height = crop_height
+    self.corner_points_weight = corner_points_weight
+    self.entropy_points_weight = entropy_points_weight
+    self.face_points_weight = face_points_weight
+    self.annotate_image = annotate_image
+    self.destop_view_image = False
+    self.dnn_model_path = dnn_model_path
--- a/modules/textual_inversion/dataset.py
+++ b/modules/textual_inversion/dataset.py
@@ -3,35 +3,38 @@ import numpy as np
 import PIL
 import torch
 from PIL import Image
-from torch.utils.data import Dataset
+from torch.utils.data import Dataset, DataLoader, Sampler
 from torchvision import transforms
+from collections import defaultdict
+from random import shuffle, choices

 import random
 import tqdm
 from modules import devices, shared
 import re

+from ldm.modules.distributions.distributions import DiagonalGaussianDistribution
+
 re_numbers_at_start = re.compile(r"^[-\d]+\s*")


 class DatasetEntry:
-    def __init__(self, filename=None, latent=None, filename_text=None):
+    def __init__(self, filename=None, filename_text=None, latent_dist=None, latent_sample=None, cond=None, cond_text=None, pixel_values=None):
        self.filename = filename
-        self.latent = latent
        self.filename_text = filename_text
-        self.cond = None
-        self.cond_text = None
+        self.latent_dist = latent_dist
+        self.latent_sample = latent_sample
+        self.cond = cond
+        self.cond_text = cond_text
+        self.pixel_values = pixel_values


 class PersonalizedBase(Dataset):
-    def __init__(self, data_root, width, height, repeats, flip_p=0.5, placeholder_token="*", model=None, device=None, template_file=None, include_cond=False, batch_size=1):
+    def __init__(self, data_root, width, height, repeats, flip_p=0.5, placeholder_token="*", model=None, cond_model=None, device=None, template_file=None, include_cond=False, batch_size=1, gradient_step=1, shuffle_tags=False, tag_drop_out=0, latent_sampling_method='once', varsize=False):
        re_word = re.compile(shared.opts.dataset_filename_word_regex) if len(shared.opts.dataset_filename_word_regex) > 0 else None

        self.placeholder_token = placeholder_token

-        self.batch_size = batch_size
-        self.width = width
-        self.height = height
        self.flip = transforms.RandomHorizontalFlip(p=flip_p)

        self.dataset = []
@@ -42,14 +45,23 @@ class PersonalizedBase(Dataset):
        self.lines = lines

        assert data_root, 'dataset directory not specified'
-
-        cond_model = shared.sd_model.cond_stage_model
+        assert os.path.isdir(data_root), "Dataset directory doesn't exist"
+        assert os.listdir(data_root), "Dataset directory is empty"

        self.image_paths = [os.path.join(data_root, file_path) for file_path in os.listdir(data_root)]
+
+        self.shuffle_tags = shuffle_tags
+        self.tag_drop_out = tag_drop_out
+        groups = defaultdict(list)
+
        print("Preparing dataset...")
        for path in tqdm.tqdm(self.image_paths):
+            if shared.state.interrupted:
+                raise Exception("interrupted")
            try:
-                image = Image.open(path).convert('RGB').resize((self.width, self.height), PIL.Image.BICUBIC)
+                image = Image.open(path).convert('RGB')
+                if not varsize:
+                    image = image.resize((width, height), PIL.Image.BICUBIC)
            except Exception:
                continue

@@ -69,53 +81,136 @@ class PersonalizedBase(Dataset):
            npimage = np.array(image).astype(np.uint8)
            npimage = (npimage / 127.5 - 1.0).astype(np.float32)

-            torchdata = torch.from_numpy(npimage).to(device=device, dtype=torch.float32)
-            torchdata = torch.moveaxis(torchdata, 2, 0)
+            torchdata = torch.from_numpy(npimage).permute(2, 0, 1).to(device=device, dtype=torch.float32)
+            latent_sample = None

-            init_latent = model.get_first_stage_encoding(model.encode_first_stage(torchdata.unsqueeze(dim=0))).squeeze()
-            init_latent = init_latent.to(devices.cpu)
+            with devices.autocast():
+                latent_dist = model.encode_first_stage(torchdata.unsqueeze(dim=0))

-            entry = DatasetEntry(filename=path, filename_text=filename_text, latent=init_latent)
+            if latent_sampling_method == "once" or (latent_sampling_method == "deterministic" and not isinstance(latent_dist, DiagonalGaussianDistribution)):
+                latent_sample = model.get_first_stage_encoding(latent_dist).squeeze().to(devices.cpu)
+                latent_sampling_method = "once"
+                entry = DatasetEntry(filename=path, filename_text=filename_text, latent_sample=latent_sample)
+            elif latent_sampling_method == "deterministic":
+                # Works only for DiagonalGaussianDistribution
+                latent_dist.std = 0
+                latent_sample = model.get_first_stage_encoding(latent_dist).squeeze().to(devices.cpu)
+                entry = DatasetEntry(filename=path, filename_text=filename_text, latent_sample=latent_sample)
+            elif latent_sampling_method == "random":
+                entry = DatasetEntry(filename=path, filename_text=filename_text, latent_dist=latent_dist)

-            if include_cond:
+            if not (self.tag_drop_out != 0 or self.shuffle_tags):
                entry.cond_text = self.create_text(filename_text)
-                entry.cond = cond_model([entry.cond_text]).to(devices.cpu).squeeze(0)

+            if include_cond and not (self.tag_drop_out != 0 or self.shuffle_tags):
+                with devices.autocast():
+                    entry.cond = cond_model([entry.cond_text]).to(devices.cpu).squeeze(0)
+            groups[image.size].append(len(self.dataset))
            self.dataset.append(entry)
+            del torchdata
+            del latent_dist
+            del latent_sample

-        assert len(self.dataset) > 1, "No images have been found in the dataset."
-        self.length = len(self.dataset) * repeats // batch_size
+        self.length = len(self.dataset)
+        self.groups = list(groups.values())
+        assert self.length > 0, "No images have been found in the dataset."
+        self.batch_size = min(batch_size, self.length)
+        self.gradient_step = min(gradient_step, self.length // self.batch_size)
+        self.latent_sampling_method = latent_sampling_method

-        self.initial_indexes = np.arange(len(self.dataset))
-        self.indexes = None
-        self.shuffle()
-
-    def shuffle(self):
-        self.indexes = self.initial_indexes[torch.randperm(self.initial_indexes.shape[0])]
+        if len(groups) > 1:
+            print("Buckets:")
+            for (w, h), ids in sorted(groups.items(), key=lambda x: x[0]):
+                print(f"  {w}x{h}: {len(ids)}")
+            print()

    def create_text(self, filename_text):
        text = random.choice(self.lines)
+        tags = filename_text.split(',')
+        if self.tag_drop_out != 0:
+            tags = [t for t in tags if random.random() > self.tag_drop_out]
+        if self.shuffle_tags:
+            random.shuffle(tags)
+        text = text.replace("[filewords]", ','.join(tags))
        text = text.replace("[name]", self.placeholder_token)
-        text = text.replace("[filewords]", filename_text)
        return text

    def __len__(self):
        return self.length

    def __getitem__(self, i):
-        res = []
+        entry = self.dataset[i]
+        if self.tag_drop_out != 0 or self.shuffle_tags:
+            entry.cond_text = self.create_text(entry.filename_text)
+        if self.latent_sampling_method == "random":
+            entry.latent_sample = shared.sd_model.get_first_stage_encoding(entry.latent_dist).to(devices.cpu)
+        return entry

-        for j in range(self.batch_size):
-            position = i * self.batch_size + j
-            if position % len(self.indexes) == 0:
-                self.shuffle()

-            index = self.indexes[position % len(self.indexes)]
-            entry = self.dataset[index]
+class GroupedBatchSampler(Sampler):
+    def __init__(self, data_source: PersonalizedBase, batch_size: int):
+        super().__init__(data_source)

-            if entry.cond is None:
-                entry.cond_text = self.create_text(entry.filename_text)
+        n = len(data_source)
+        self.groups = data_source.groups
+        self.len = n_batch = n // batch_size
+        expected = [len(g) / n * n_batch * batch_size for g in data_source.groups]
+        self.base = [int(e) // batch_size for e in expected]
+        self.n_rand_batches = nrb = n_batch - sum(self.base)
+        self.probs = [e%batch_size/nrb/batch_size if nrb>0 else 0 for e in expected]
+        self.batch_size = batch_size

-            res.append(entry)
+    def __len__(self):
+        return self.len

-        return res
+    def __iter__(self):
+        b = self.batch_size
+
+        for g in self.groups:
+            shuffle(g)
+
+        batches = []
+        for g in self.groups:
+            batches.extend(g[i*b:(i+1)*b] for i in range(len(g) // b))
+        for _ in range(self.n_rand_batches):
+            rand_group = choices(self.groups, self.probs)[0]
+            batches.append(choices(rand_group, k=b))
+
+        shuffle(batches)
+
+        yield from batches
+
+
+class PersonalizedDataLoader(DataLoader):
+    def __init__(self, dataset, latent_sampling_method="once", batch_size=1, pin_memory=False):
+        super(PersonalizedDataLoader, self).__init__(dataset, batch_sampler=GroupedBatchSampler(dataset, batch_size), pin_memory=pin_memory)
+        if latent_sampling_method == "random":
+            self.collate_fn = collate_wrapper_random
+        else:
+            self.collate_fn = collate_wrapper
+
+
+class BatchLoader:
+    def __init__(self, data):
+        self.cond_text = [entry.cond_text for entry in data]
+        self.cond = [entry.cond for entry in data]
+        self.latent_sample = torch.stack([entry.latent_sample for entry in data]).squeeze(1)
+        #self.emb_index = [entry.emb_index for entry in data]
+        #print(self.latent_sample.device)
+
+    def pin_memory(self):
+        self.latent_sample = self.latent_sample.pin_memory()
+        return self
+
+def collate_wrapper(batch):
+    return BatchLoader(batch)
+
+class BatchLoaderRandom(BatchLoader):
+    def __init__(self, data):
+        super().__init__(data)
+
+    def pin_memory(self):
+        return self
+
+def collate_wrapper_random(batch):
+    return BatchLoaderRandom(batch)
--- a/modules/textual_inversion/image_embedding.py
+++ b/modules/textual_inversion/image_embedding.py
@@ -5,6 +5,7 @@ import zlib
 from PIL import Image, PngImagePlugin, ImageDraw, ImageFont
 from fonts.ttf import Roboto
 import torch
+from modules.shared import opts


 class EmbeddingEncoder(json.JSONEncoder):
@@ -75,10 +76,10 @@ def insert_image_data_embed(image, data):
    next_size = data_np_low.shape[0] + (h-(data_np_low.shape[0] % h))
    next_size = next_size + ((h*d)-(next_size % (h*d)))

-    data_np_low.resize(next_size)
+    data_np_low = np.resize(data_np_low, next_size)
    data_np_low = data_np_low.reshape((h, -1, d))

-    data_np_high.resize(next_size)
+    data_np_high = np.resize(data_np_high, next_size)
    data_np_high = data_np_high.reshape((h, -1, d))

    edge_style = list(data['string_to_param'].values())[0].cpu().detach().numpy().tolist()[0][:1024]
@@ -133,7 +134,7 @@ def caption_image_overlay(srcimage, title, footerLeft, footerMid, footerRight, t
    from math import cos

    image = srcimage.copy()
-
+    fontsize = 32
    if textfont is None:
        try:
            textfont = ImageFont.truetype(opts.font or Roboto, fontsize)
@@ -150,7 +151,7 @@ def caption_image_overlay(srcimage, title, footerLeft, footerMid, footerRight, t
    image = Image.alpha_composite(image.convert('RGBA'), gradient.resize(image.size))

    draw = ImageDraw.Draw(image)
-    fontsize = 32
+
    font = ImageFont.truetype(textfont, fontsize)
    padding = 10

--- a/modules/textual_inversion/learn_schedule.py
+++ b/modules/textual_inversion/learn_schedule.py
@@ -4,30 +4,37 @@ import tqdm
 class LearnScheduleIterator:
    def __init__(self, learn_rate, max_steps, cur_step=0):
        """
-        specify learn_rate as "0.001:100, 0.00001:1000, 1e-5:10000" to have lr of 0.001 until step 100, 0.00001 until 1000, 1e-5:10000 until 10000
+        specify learn_rate as "0.001:100, 0.00001:1000, 1e-5:10000" to have lr of 0.001 until step 100, 0.00001 until 1000, and 1e-5 until 10000
        """

        pairs = learn_rate.split(',')
        self.rates = []
        self.it = 0
        self.maxit = 0
-        for i, pair in enumerate(pairs):
-            tmp = pair.split(':')
-            if len(tmp) == 2:
-                step = int(tmp[1])
-                if step > cur_step:
-                    self.rates.append((float(tmp[0]), min(step, max_steps)))
-                    self.maxit += 1
-                    if step > max_steps:
+        try:
+            for i, pair in enumerate(pairs):
+                if not pair.strip():
+                    continue
+                tmp = pair.split(':')
+                if len(tmp) == 2:
+                    step = int(tmp[1])
+                    if step > cur_step:
+                        self.rates.append((float(tmp[0]), min(step, max_steps)))
+                        self.maxit += 1
+                        if step > max_steps:
+                            return
+                    elif step == -1:
+                        self.rates.append((float(tmp[0]), max_steps))
+                        self.maxit += 1
                        return
-                elif step == -1:
+                else:
                    self.rates.append((float(tmp[0]), max_steps))
                    self.maxit += 1
                    return
-            else:
-                self.rates.append((float(tmp[0]), max_steps))
-                self.maxit += 1
-                return
+            assert self.rates
+        except (ValueError, AssertionError):
+            raise Exception('Invalid learning rate schedule. It should be a number or, for example, like "0.001:100, 0.00001:1000, 1e-5:10000" to have lr of 0.001 until step 100, 0.00001 until 1000, and 1e-5 until 10000.')
+

    def __iter__(self):
        return self
@@ -51,14 +58,19 @@ class LearnRateScheduler:

        self.finished = False

-    def apply(self, optimizer, step_number):
-        if step_number <= self.end_step:
-            return
+    def step(self, step_number):
+        if step_number < self.end_step:
+            return False

        try:
            (self.learn_rate, self.end_step) = next(self.schedules)
-        except Exception:
+        except StopIteration:
            self.finished = True
+            return False
+        return True
+
+    def apply(self, optimizer, step_number):
+        if not self.step(step_number):
            return

        if self.verbose:
--- a/modules/textual_inversion/logging.py
+++ b/modules/textual_inversion/logging.py
@@ -0,0 +1,24 @@
+import datetime
+import json
+import os
+
+saved_params_shared = {"model_name", "model_hash", "initial_step", "num_of_dataset_images", "learn_rate", "batch_size", "clip_grad_mode", "clip_grad_value", "gradient_step", "data_root", "log_directory", "training_width", "training_height", "steps", "create_image_every", "template_file"}
+saved_params_ti = {"embedding_name", "num_vectors_per_token", "save_embedding_every", "save_image_with_stored_embedding"}
+saved_params_hypernet = {"hypernetwork_name", "layer_structure", "activation_func", "weight_init", "add_layer_norm", "use_dropout", "save_hypernetwork_every"}
+saved_params_all = saved_params_shared | saved_params_ti | saved_params_hypernet
+saved_params_previews = {"preview_prompt", "preview_negative_prompt", "preview_steps", "preview_sampler_index", "preview_cfg_scale", "preview_seed", "preview_width", "preview_height"}
+
+
+def save_settings_to_file(log_directory, all_params):
+    now = datetime.datetime.now()
+    params = {"datetime": now.strftime("%Y-%m-%d %H:%M:%S")}
+
+    keys = saved_params_all
+    if all_params.get('preview_from_txt2img'):
+        keys = keys | saved_params_previews
+
+    params.update({k: v for k, v in all_params.items() if k in keys})
+
+    filename = f'settings-{now.strftime("%Y-%m-%d-%H-%M-%S")}.json'
+    with open(os.path.join(log_directory, filename), "w") as file:
+        json.dump(params, file, indent=4)
--- a/modules/textual_inversion/preprocess.py
+++ b/modules/textual_inversion/preprocess.py
@@ -1,27 +1,26 @@
 import os
 from PIL import Image, ImageOps
+import math
 import platform
 import sys
 import tqdm
 import time

-from modules import shared, images
+from modules import shared, images, deepbooru
+from modules.paths import models_path
 from modules.shared import opts, cmd_opts
-if cmd_opts.deepdanbooru:
-    import modules.deepbooru as deepbooru
+from modules.textual_inversion import autocrop


-def preprocess(process_src, process_dst, process_width, process_height, process_flip, process_split, process_caption, process_caption_deepbooru=False):
+def preprocess(process_src, process_dst, process_width, process_height, preprocess_txt_action, process_flip, process_split, process_caption, process_caption_deepbooru=False, split_threshold=0.5, overlap_ratio=0.2, process_focal_crop=False, process_focal_crop_face_weight=0.9, process_focal_crop_entropy_weight=0.3, process_focal_crop_edges_weight=0.5, process_focal_crop_debug=False):
    try:
        if process_caption:
            shared.interrogator.load()

        if process_caption_deepbooru:
-            db_opts = deepbooru.create_deepbooru_opts()
-            db_opts[deepbooru.OPT_INCLUDE_RANKS] = False
-            deepbooru.create_deepbooru_process(opts.interrogate_deepbooru_score_threshold, db_opts)
+            deepbooru.model.start()

-        preprocess_work(process_src, process_dst, process_width, process_height, process_flip, process_split, process_caption, process_caption_deepbooru)
+        preprocess_work(process_src, process_dst, process_width, process_height, preprocess_txt_action, process_flip, process_split, process_caption, process_caption_deepbooru, split_threshold, overlap_ratio, process_focal_crop, process_focal_crop_face_weight, process_focal_crop_entropy_weight, process_focal_crop_edges_weight, process_focal_crop_debug)

    finally:

@@ -29,88 +28,174 @@ def preprocess(process_src, process_dst, process_width, process_height, process_
            shared.interrogator.send_blip_to_ram()

        if process_caption_deepbooru:
-            deepbooru.release_process()
+            deepbooru.model.stop()


+def listfiles(dirname):
+    return os.listdir(dirname)

-def preprocess_work(process_src, process_dst, process_width, process_height, process_flip, process_split, process_caption, process_caption_deepbooru=False):
+
+class PreprocessParams:
+    src = None
+    dstdir = None
+    subindex = 0
+    flip = False
+    process_caption = False
+    process_caption_deepbooru = False
+    preprocess_txt_action = None
+
+
+def save_pic_with_caption(image, index, params: PreprocessParams, existing_caption=None):
+    caption = ""
+
+    if params.process_caption:
+        caption += shared.interrogator.generate_caption(image)
+
+    if params.process_caption_deepbooru:
+        if len(caption) > 0:
+            caption += ", "
+        caption += deepbooru.model.tag_multi(image)
+
+    filename_part = params.src
+    filename_part = os.path.splitext(filename_part)[0]
+    filename_part = os.path.basename(filename_part)
+
+    basename = f"{index:05}-{params.subindex}-{filename_part}"
+    image.save(os.path.join(params.dstdir, f"{basename}.png"))
+
+    if params.preprocess_txt_action == 'prepend' and existing_caption:
+        caption = existing_caption + ' ' + caption
+    elif params.preprocess_txt_action == 'append' and existing_caption:
+        caption = caption + ' ' + existing_caption
+    elif params.preprocess_txt_action == 'copy' and existing_caption:
+        caption = existing_caption
+
+    caption = caption.strip()
+
+    if len(caption) > 0:
+        with open(os.path.join(params.dstdir, f"{basename}.txt"), "w", encoding="utf8") as file:
+            file.write(caption)
+
+    params.subindex += 1
+
+
+def save_pic(image, index, params, existing_caption=None):
+    save_pic_with_caption(image, index, params, existing_caption=existing_caption)
+
+    if params.flip:
+        save_pic_with_caption(ImageOps.mirror(image), index, params, existing_caption=existing_caption)
+
+
+def split_pic(image, inverse_xy, width, height, overlap_ratio):
+    if inverse_xy:
+        from_w, from_h = image.height, image.width
+        to_w, to_h = height, width
+    else:
+        from_w, from_h = image.width, image.height
+        to_w, to_h = width, height
+    h = from_h * to_w // from_w
+    if inverse_xy:
+        image = image.resize((h, to_w))
+    else:
+        image = image.resize((to_w, h))
+
+    split_count = math.ceil((h - to_h * overlap_ratio) / (to_h * (1.0 - overlap_ratio)))
+    y_step = (h - to_h) / (split_count - 1)
+    for i in range(split_count):
+        y = int(y_step * i)
+        if inverse_xy:
+            splitted = image.crop((y, 0, y + to_h, to_w))
+        else:
+            splitted = image.crop((0, y, to_w, y + to_h))
+        yield splitted
+
+
+def preprocess_work(process_src, process_dst, process_width, process_height, preprocess_txt_action, process_flip, process_split, process_caption, process_caption_deepbooru=False, split_threshold=0.5, overlap_ratio=0.2, process_focal_crop=False, process_focal_crop_face_weight=0.9, process_focal_crop_entropy_weight=0.3, process_focal_crop_edges_weight=0.5, process_focal_crop_debug=False):
    width = process_width
    height = process_height
    src = os.path.abspath(process_src)
    dst = os.path.abspath(process_dst)
+    split_threshold = max(0.0, min(1.0, split_threshold))
+    overlap_ratio = max(0.0, min(0.9, overlap_ratio))

    assert src != dst, 'same directory specified as source and destination'

    os.makedirs(dst, exist_ok=True)

-    files = os.listdir(src)
+    files = listfiles(src)

+    shared.state.job = "preprocess"
    shared.state.textinfo = "Preprocessing..."
    shared.state.job_count = len(files)

-    def save_pic_with_caption(image, index):
-        caption = ""
+    params = PreprocessParams()
+    params.dstdir = dst
+    params.flip = process_flip
+    params.process_caption = process_caption
+    params.process_caption_deepbooru = process_caption_deepbooru
+    params.preprocess_txt_action = preprocess_txt_action

-        if process_caption:
-            caption += shared.interrogator.generate_caption(image)
-
-        if process_caption_deepbooru:
-            if len(caption) > 0:
-                caption += ", "
-            caption += deepbooru.get_tags_from_process(image)
-
-        filename_part = filename
-        filename_part = os.path.splitext(filename_part)[0]
-        filename_part = os.path.basename(filename_part)
-
-        basename = f"{index:05}-{subindex[0]}-{filename_part}"
-        image.save(os.path.join(dst, f"{basename}.png"))
-
-        if len(caption) > 0:
-            with open(os.path.join(dst, f"{basename}.txt"), "w", encoding="utf8") as file:
-                file.write(caption)
-
-        subindex[0] += 1
-
-    def save_pic(image, index):
-        save_pic_with_caption(image, index)
-
-        if process_flip:
-            save_pic_with_caption(ImageOps.mirror(image), index)
-
-    for index, imagefile in enumerate(tqdm.tqdm(files)):
-        subindex = [0]
+    pbar = tqdm.tqdm(files)
+    for index, imagefile in enumerate(pbar):
+        params.subindex = 0
        filename = os.path.join(src, imagefile)
        try:
            img = Image.open(filename).convert("RGB")
        except Exception:
            continue

+        description = f"Preprocessing [Image {index}/{len(files)}]"
+        pbar.set_description(description)
+        shared.state.textinfo = description
+
+        params.src = filename
+
+        existing_caption = None
+        existing_caption_filename = os.path.splitext(filename)[0] + '.txt'
+        if os.path.exists(existing_caption_filename):
+            with open(existing_caption_filename, 'r', encoding="utf8") as file:
+                existing_caption = file.read()
+
        if shared.state.interrupted:
            break

-        ratio = img.height / img.width
-        is_tall = ratio > 1.35
-        is_wide = ratio < 1 / 1.35
-
-        if process_split and is_tall:
-            img = img.resize((width, height * img.height // img.width))
-
-            top = img.crop((0, 0, width, height))
-            save_pic(top, index)
-
-            bot = img.crop((0, img.height - height, width, img.height))
-            save_pic(bot, index)
-        elif process_split and is_wide:
-            img = img.resize((width * img.width // img.height, height))
-
-            left = img.crop((0, 0, width, height))
-            save_pic(left, index)
-
-            right = img.crop((img.width - width, 0, img.width, height))
-            save_pic(right, index)
+        if img.height > img.width:
+            ratio = (img.width * height) / (img.height * width)
+            inverse_xy = False
        else:
+            ratio = (img.height * width) / (img.width * height)
+            inverse_xy = True
+
+        process_default_resize = True
+
+        if process_split and ratio < 1.0 and ratio <= split_threshold:
+            for splitted in split_pic(img, inverse_xy, width, height, overlap_ratio):
+                save_pic(splitted, index, params, existing_caption=existing_caption)
+            process_default_resize = False
+
+        if process_focal_crop and img.height != img.width:
+
+            dnn_model_path = None
+            try:
+                dnn_model_path = autocrop.download_and_cache_models(os.path.join(models_path, "opencv"))
+            except Exception as e:
+                print("Unable to load face detection model for auto crop selection. Falling back to lower quality haar method.", e)
+
+            autocrop_settings = autocrop.Settings(
+                crop_width = width,
+                crop_height = height,
+                face_points_weight = process_focal_crop_face_weight,
+                entropy_points_weight = process_focal_crop_entropy_weight,
+                corner_points_weight = process_focal_crop_edges_weight,
+                annotate_image = process_focal_crop_debug,
+                dnn_model_path = dnn_model_path,
+            )
+            for focal in autocrop.crop_image(img, autocrop_settings):
+                save_pic(focal, index, params, existing_caption=existing_caption)
+            process_default_resize = False
+
+        if process_default_resize:
            img = images.resize_image(1, img, width, height)
-            save_pic(img, index)
+            save_pic(img, index, params, existing_caption=existing_caption)

        shared.state.nextjob()
--- a/modules/textual_inversion/textual_inversion.py
+++ b/modules/textual_inversion/textual_inversion.py
@@ -1,32 +1,54 @@
 import os
 import sys
 import traceback
+import inspect
+from collections import namedtuple

 import torch
 import tqdm
 import html
 import datetime
 import csv
-import numpy as np
+import safetensors.torch

 from PIL import Image, PngImagePlugin
 from torch.utils.tensorboard import SummaryWriter
-from modules import shared, devices, sd_hijack, processing, sd_models
+
+from modules import shared, devices, sd_hijack, processing, sd_models, images, sd_samplers
 import modules.textual_inversion.dataset
 from modules.textual_inversion.learn_schedule import LearnRateScheduler

-from modules.textual_inversion.image_embedding import (embedding_to_b64, embedding_from_b64,
-                                                       insert_image_data_embed, extract_image_data_embed,
-                                                       caption_image_overlay)
+from modules.textual_inversion.image_embedding import embedding_to_b64, embedding_from_b64, insert_image_data_embed, extract_image_data_embed, caption_image_overlay
+from modules.textual_inversion.logging import save_settings_to_file
+
+
+TextualInversionTemplate = namedtuple("TextualInversionTemplate", ["name", "path"])
+textual_inversion_templates = {}
+
+
+def list_textual_inversion_templates():
+    textual_inversion_templates.clear()
+
+    for root, dirs, fns in os.walk(shared.cmd_opts.textual_inversion_templates_dir):
+        for fn in fns:
+            path = os.path.join(root, fn)
+
+            textual_inversion_templates[fn] = TextualInversionTemplate(fn, path)
+
+    return textual_inversion_templates
+

 class Embedding:
    def __init__(self, vec, name, step=None):
        self.vec = vec
        self.name = name
        self.step = step
+        self.shape = None
+        self.vectors = 0
        self.cached_checksum = None
        self.sd_checkpoint = None
        self.sd_checkpoint_name = None
+        self.optimizer_state_dict = None

    def save(self, filename):
        embedding_data = {
@@ -40,6 +62,13 @@ class Embedding:

        torch.save(embedding_data, filename)

+        if shared.opts.save_optimizer_state and self.optimizer_state_dict is not None:
+            optimizer_saved_dict = {
+                'hash': self.checksum(),
+                'optimizer_state_dict': self.optimizer_state_dict,
+            }
+            torch.save(optimizer_saved_dict, filename + '.optim')
+
    def checksum(self):
        if self.cached_checksum is not None:
            return self.cached_checksum
@@ -54,18 +83,44 @@ class Embedding:
        return self.cached_checksum


+class DirWithTextualInversionEmbeddings:
+    def __init__(self, path):
+        self.path = path
+        self.mtime = None
+
+    def has_changed(self):
+        if not os.path.isdir(self.path):
+            return False
+
+        mt = os.path.getmtime(self.path)
+        if self.mtime is None or mt > self.mtime:
+            return True
+
+    def update(self):
+        if not os.path.isdir(self.path):
+            return
+
+        self.mtime = os.path.getmtime(self.path)
+
+
 class EmbeddingDatabase:
-    def __init__(self, embeddings_dir):
+    def __init__(self):
        self.ids_lookup = {}
        self.word_embeddings = {}
-        self.dir_mtime = None
-        self.embeddings_dir = embeddings_dir
+        self.skipped_embeddings = {}
+        self.expected_shape = -1
+        self.embedding_dirs = {}
+
+    def add_embedding_dir(self, path):
+        self.embedding_dirs[path] = DirWithTextualInversionEmbeddings(path)
+
+    def clear_embedding_dirs(self):
+        self.embedding_dirs.clear()

    def register_embedding(self, embedding, model):
-
        self.word_embeddings[embedding.name] = embedding

-        ids = model.cond_stage_model.tokenizer([embedding.name], add_special_tokens=False)['input_ids'][0]
+        ids = model.cond_stage_model.tokenize([embedding.name])[0]

        first_id = ids[0]
        if first_id not in self.ids_lookup:
@@ -75,70 +130,104 @@ class EmbeddingDatabase:

        return embedding

-    def load_textual_inversion_embeddings(self):
-        mt = os.path.getmtime(self.embeddings_dir)
-        if self.dir_mtime is not None and mt <= self.dir_mtime:
+    def get_expected_shape(self):
+        vec = shared.sd_model.cond_stage_model.encode_embedding_init_text(",", 1)
+        return vec.shape[1]
+
+    def load_from_file(self, path, filename):
+        name, ext = os.path.splitext(filename)
+        ext = ext.upper()
+
+        if ext in ['.PNG', '.WEBP', '.JXL', '.AVIF']:
+            _, second_ext = os.path.splitext(name)
+            if second_ext.upper() == '.PREVIEW':
+                return
+
+            embed_image = Image.open(path)
+            if hasattr(embed_image, 'text') and 'sd-ti-embedding' in embed_image.text:
+                data = embedding_from_b64(embed_image.text['sd-ti-embedding'])
+                name = data.get('name', name)
+            else:
+                data = extract_image_data_embed(embed_image)
+                name = data.get('name', name)
+        elif ext in ['.BIN', '.PT']:
+            data = torch.load(path, map_location="cpu")
+        elif ext in ['.SAFETENSORS']:
+            data = safetensors.torch.load_file(path, device="cpu")
+        else:
            return

-        self.dir_mtime = mt
-        self.ids_lookup.clear()
-        self.word_embeddings.clear()
+        # textual inversion embeddings
+        if 'string_to_param' in data:
+            param_dict = data['string_to_param']
+            if hasattr(param_dict, '_parameters'):
+                param_dict = getattr(param_dict, '_parameters')  # fix for torch 1.12.1 loading saved file from torch 1.11
+            assert len(param_dict) == 1, 'embedding file has multiple terms in it'
+            emb = next(iter(param_dict.items()))[1]
+        # diffuser concepts
+        elif type(data) == dict and type(next(iter(data.values()))) == torch.Tensor:
+            assert len(data.keys()) == 1, 'embedding file has multiple terms in it'

-        def process_file(path, filename):
-            name = os.path.splitext(filename)[0]
+            emb = next(iter(data.values()))
+            if len(emb.shape) == 1:
+                emb = emb.unsqueeze(0)
+        else:
+            raise Exception(f"Couldn't identify {filename} as neither textual inversion embedding nor diffuser concept.")

-            data = []
+        vec = emb.detach().to(devices.device, dtype=torch.float32)
+        embedding = Embedding(vec, name)
+        embedding.step = data.get('step', None)
+        embedding.sd_checkpoint = data.get('sd_checkpoint', None)
+        embedding.sd_checkpoint_name = data.get('sd_checkpoint_name', None)
+        embedding.vectors = vec.shape[0]
+        embedding.shape = vec.shape[-1]

-            if os.path.splitext(filename.upper())[-1] in ['.PNG', '.WEBP', '.JXL', '.AVIF']:
-                embed_image = Image.open(path)
-                if hasattr(embed_image, 'text') and 'sd-ti-embedding' in embed_image.text:
-                    data = embedding_from_b64(embed_image.text['sd-ti-embedding'])
-                    name = data.get('name', name)
-                else:
-                    data = extract_image_data_embed(embed_image)
-                    name = data.get('name', name)
-            else:
-                data = torch.load(path, map_location="cpu")
-
-            # textual inversion embeddings
-            if 'string_to_param' in data:
-                param_dict = data['string_to_param']
-                if hasattr(param_dict, '_parameters'):
-                    param_dict = getattr(param_dict, '_parameters')  # fix for torch 1.12.1 loading saved file from torch 1.11
-                assert len(param_dict) == 1, 'embedding file has multiple terms in it'
-                emb = next(iter(param_dict.items()))[1]
-            # diffuser concepts
-            elif type(data) == dict and type(next(iter(data.values()))) == torch.Tensor:
-                assert len(data.keys()) == 1, 'embedding file has multiple terms in it'
-
-                emb = next(iter(data.values()))
-                if len(emb.shape) == 1:
-                    emb = emb.unsqueeze(0)
-            else:
-                raise Exception(f"Couldn't identify {filename} as neither textual inversion embedding nor diffuser concept.")
-
-            vec = emb.detach().to(devices.device, dtype=torch.float32)
-            embedding = Embedding(vec, name)
-            embedding.step = data.get('step', None)
-            embedding.sd_checkpoint = data.get('hash', None)
-            embedding.sd_checkpoint_name = data.get('sd_checkpoint_name', None)
+        if self.expected_shape == -1 or self.expected_shape == embedding.shape:
            self.register_embedding(embedding, shared.sd_model)
+        else:
+            self.skipped_embeddings[name] = embedding

-        for fn in os.listdir(self.embeddings_dir):
-            try:
-                fullfn = os.path.join(self.embeddings_dir, fn)
+    def load_from_dir(self, embdir):
+        if not os.path.isdir(embdir.path):
+            return

-                if os.stat(fullfn).st_size == 0:
+        for root, dirs, fns in os.walk(embdir.path):
+            for fn in fns:
+                try:
+                    fullfn = os.path.join(root, fn)
+
+                    if os.stat(fullfn).st_size == 0:
+                        continue
+
+                    self.load_from_file(fullfn, fn)
+                except Exception:
+                    print(f"Error loading embedding {fn}:", file=sys.stderr)
+                    print(traceback.format_exc(), file=sys.stderr)
                    continue

-                process_file(fullfn, fn)
-            except Exception:
-                print(f"Error loading emedding {fn}:", file=sys.stderr)
-                print(traceback.format_exc(), file=sys.stderr)
-                continue
+    def load_textual_inversion_embeddings(self, force_reload=False):
+        if not force_reload:
+            need_reload = False
+            for path, embdir in self.embedding_dirs.items():
+                if embdir.has_changed():
+                    need_reload = True
+                    break

-        print(f"Loaded a total of {len(self.word_embeddings)} textual inversion embeddings.")
-        print("Embeddings:", ', '.join(self.word_embeddings.keys()))
+            if not need_reload:
+                return
+
+        self.ids_lookup.clear()
+        self.word_embeddings.clear()
+        self.skipped_embeddings.clear()
+        self.expected_shape = self.get_expected_shape()
+
+        for path, embdir in self.embedding_dirs.items():
+            self.load_from_dir(embdir)
+            embdir.update()
+
+        print(f"Textual inversion embeddings loaded({len(self.word_embeddings)}): {', '.join(self.word_embeddings.keys())}")
+        if len(self.skipped_embeddings) > 0:
+            print(f"Textual inversion embeddings skipped({len(self.skipped_embeddings)}): {', '.join(self.skipped_embeddings.keys())}")

    def find_embedding_at_position(self, tokens, offset):
        token = tokens[offset]
@@ -154,19 +243,26 @@ class EmbeddingDatabase:
        return None, None


-def create_embedding(name, num_vectors_per_token, init_text='*'):
+def create_embedding(name, num_vectors_per_token, overwrite_old, init_text='*'):
    cond_model = shared.sd_model.cond_stage_model
-    embedding_layer = cond_model.wrapped.transformer.text_model.embeddings

-    ids = cond_model.tokenizer(init_text, max_length=num_vectors_per_token, return_tensors="pt", add_special_tokens=False)["input_ids"]
-    embedded = embedding_layer.token_embedding.wrapped(ids.to(devices.device)).squeeze(0)
+    with devices.autocast():
+        cond_model([""])  # will send cond model to GPU if lowvram/medvram is active
+
+    #cond_model expects at least some text, so we provide '*' as backup.
+    embedded = cond_model.encode_embedding_init_text(init_text or '*', num_vectors_per_token)
    vec = torch.zeros((num_vectors_per_token, embedded.shape[1]), device=devices.device)

-    for i in range(num_vectors_per_token):
-        vec[i] = embedded[i * int(embedded.shape[0]) // num_vectors_per_token]
+    #Only copy if we provided an init_text, otherwise keep vectors as zeros
+    if init_text:
+        for i in range(num_vectors_per_token):
+            vec[i] = embedded[i * int(embedded.shape[0]) // num_vectors_per_token]

+    # Remove illegal characters from name.
+    name = "".join( x for x in name if (x.isalnum() or x in "._- "))
    fn = os.path.join(shared.cmd_opts.embeddings_dir, f"{name}.pt")
-    assert not os.path.exists(fn), f"file {fn} already exists"
+    if not overwrite_old:
+        assert not os.path.exists(fn), f"file {fn} already exists"

    embedding = Embedding(vec, name)
    embedding.step = 0
@@ -181,7 +277,6 @@ def write_loss(log_directory, filename, step, epoch_len, values):

    if step % shared.opts.training_write_csv_every != 0:
        return
-
    write_csv_header = False if os.path.exists(os.path.join(log_directory, filename)) else True

    with open(os.path.join(log_directory, filename), "a+", newline='') as fout:
@@ -190,13 +285,13 @@ def write_loss(log_directory, filename, step, epoch_len, values):
        if write_csv_header:
            csv_writer.writeheader()

-        epoch = step // epoch_len
-        epoch_step = step - epoch * epoch_len
+        epoch = (step - 1) // epoch_len
+        epoch_step = (step - 1) % epoch_len

        csv_writer.writerow({
-            "step": step + 1,
-            "epoch": epoch + 1,
-            "epoch_step": epoch_step + 1,
+            "step": step,
+            "epoch": epoch,
+            "epoch_step": epoch_step,
            **values,
        })

@@ -225,15 +320,45 @@ def tensorboard_add_image(tensorboard_writer, tag, pil_image, step):
                
    tensorboard_writer.add_image(tag, img_tensor, global_step=step)

-def train_embedding(embedding_name, learn_rate, batch_size, data_root, log_directory, training_width, training_height, steps, create_image_every, save_embedding_every, template_file, save_image_with_stored_embedding, preview_from_txt2img, preview_prompt, preview_negative_prompt, preview_steps, preview_sampler_index, preview_cfg_scale, preview_seed, preview_width, preview_height):
-    assert embedding_name, 'embedding not selected'
+def validate_train_inputs(model_name, learn_rate, batch_size, gradient_step, data_root, template_file, template_filename, steps, save_model_every, create_image_every, log_directory, name="embedding"):
+    assert model_name, f"{name} not selected"
+    assert learn_rate, "Learning rate is empty or 0"
+    assert isinstance(batch_size, int), "Batch size must be integer"
+    assert batch_size > 0, "Batch size must be positive"
+    assert isinstance(gradient_step, int), "Gradient accumulation step must be integer"
+    assert gradient_step > 0, "Gradient accumulation step must be positive"
+    assert data_root, "Dataset directory is empty"
+    assert os.path.isdir(data_root), "Dataset directory doesn't exist"
+    assert os.listdir(data_root), "Dataset directory is empty"
+    assert template_filename, "Prompt template file not selected"
+    assert template_file, f"Prompt template file {template_filename} not found"
+    assert os.path.isfile(template_file.path), f"Prompt template file {template_filename} doesn't exist"
+    assert steps, "Max steps is empty or 0"
+    assert isinstance(steps, int), "Max steps must be integer"
+    assert steps > 0, "Max steps must be positive"
+    assert isinstance(save_model_every, int), "Save {name} must be integer"
+    assert save_model_every >= 0, "Save {name} must be positive or 0"
+    assert isinstance(create_image_every, int), "Create image must be integer"
+    assert create_image_every >= 0, "Create image must be positive or 0"
+    if save_model_every or create_image_every:
+        assert log_directory, "Log directory is empty"

+
+def train_embedding(embedding_name, learn_rate, batch_size, gradient_step, data_root, log_directory, training_width, training_height, varsize, steps, clip_grad_mode, clip_grad_value, shuffle_tags, tag_drop_out, latent_sampling_method, create_image_every, save_embedding_every, template_filename, save_image_with_stored_embedding, preview_from_txt2img, preview_prompt, preview_negative_prompt, preview_steps, preview_sampler_index, preview_cfg_scale, preview_seed, preview_width, preview_height):
+    save_embedding_every = save_embedding_every or 0
+    create_image_every = create_image_every or 0
+    template_file = textual_inversion_templates.get(template_filename, None)
+    validate_train_inputs(embedding_name, learn_rate, batch_size, gradient_step, data_root, template_file, template_filename, steps, save_embedding_every, create_image_every, log_directory, name="embedding")
+    template_file = template_file.path
+
+    shared.state.job = "train-embedding"
    shared.state.textinfo = "Initializing textual inversion training..."
    shared.state.job_count = steps

    filename = os.path.join(shared.cmd_opts.embeddings_dir, f'{embedding_name}.pt')

    log_directory = os.path.join(log_directory, datetime.datetime.now().strftime("%Y-%m-%d"), embedding_name)
+    unload = shared.opts.unload_models_when_training

    if save_embedding_every > 0:
        embedding_dir = os.path.join(log_directory, "embeddings")
@@ -252,160 +377,264 @@ def train_embedding(embedding_name, learn_rate, batch_size, data_root, log_direc
        os.makedirs(images_embeds_dir, exist_ok=True)
    else:
        images_embeds_dir = None
-        
-    cond_model = shared.sd_model.cond_stage_model
-
-    shared.state.textinfo = f"Preparing dataset from {html.escape(data_root)}..."
-    with torch.autocast("cuda"):
-        ds = modules.textual_inversion.dataset.PersonalizedBase(data_root=data_root, width=training_width, height=training_height, repeats=shared.opts.training_image_repeats_per_epoch, placeholder_token=embedding_name, model=shared.sd_model, device=devices.device, template_file=template_file, batch_size=batch_size)

    hijack = sd_hijack.model_hijack

    embedding = hijack.embedding_db.word_embeddings[embedding_name]
-    embedding.vec.requires_grad = True
-
-    losses = torch.zeros((32,))
-
-    last_saved_file = "<none>"
-    last_saved_image = "<none>"
-    embedding_yet_to_be_embedded = False
+    checkpoint = sd_models.select_checkpoint()

    initial_step = embedding.step or 0
-    if initial_step > steps:
+    if initial_step >= steps:
+        shared.state.textinfo = "Model has already been trained beyond specified max steps"
        return embedding, filename
-
+    
    scheduler = LearnRateScheduler(learn_rate, steps, initial_step)
-    optimizer = torch.optim.AdamW([embedding.vec], lr=scheduler.learn_rate)
-
+    clip_grad = torch.nn.utils.clip_grad_value_ if clip_grad_mode == "value" else \
+        torch.nn.utils.clip_grad_norm_ if clip_grad_mode == "norm" else \
+        None
+    if clip_grad:
+        clip_grad_sched = LearnRateScheduler(clip_grad_value, steps, initial_step, verbose=False)
+    # dataset loading may take a while, so input validations and early returns should be done before this
+    shared.state.textinfo = f"Preparing dataset from {html.escape(data_root)}..."
+    old_parallel_processing_allowed = shared.parallel_processing_allowed
+    
    if shared.opts.training_enable_tensorboard:
        tensorboard_writer = tensorboard_setup(log_directory)

-    pbar = tqdm.tqdm(enumerate(ds), total=steps-initial_step)
-    for i, entries in pbar:
-        embedding.step = i + initial_step
+    pin_memory = shared.opts.pin_memory

-        scheduler.apply(optimizer, embedding.step)
-        if scheduler.finished:
-            break
+    ds = modules.textual_inversion.dataset.PersonalizedBase(data_root=data_root, width=training_width, height=training_height, repeats=shared.opts.training_image_repeats_per_epoch, placeholder_token=embedding_name, model=shared.sd_model, cond_model=shared.sd_model.cond_stage_model, device=devices.device, template_file=template_file, batch_size=batch_size, gradient_step=gradient_step, shuffle_tags=shuffle_tags, tag_drop_out=tag_drop_out, latent_sampling_method=latent_sampling_method, varsize=varsize)

-        if shared.state.interrupted:
-            break
+    if shared.opts.save_training_settings_to_txt:
+        save_settings_to_file(log_directory, {**dict(model_name=checkpoint.model_name, model_hash=checkpoint.hash, num_of_dataset_images=len(ds), num_vectors_per_token=len(embedding.vec)), **locals()})

-        with torch.autocast("cuda"):
-            c = cond_model([entry.cond_text for entry in entries])
-            x = torch.stack([entry.latent for entry in entries]).to(devices.device)
-            loss = shared.sd_model(x, c)[0]
-            del x
+    latent_sampling_method = ds.latent_sampling_method

-            losses[embedding.step % losses.shape[0]] = loss.item()
+    dl = modules.textual_inversion.dataset.PersonalizedDataLoader(ds, latent_sampling_method=latent_sampling_method, batch_size=ds.batch_size, pin_memory=pin_memory)
+
+    if unload:
+        shared.parallel_processing_allowed = False
+        shared.sd_model.first_stage_model.to(devices.cpu)
+
+    embedding.vec.requires_grad = True
+    optimizer = torch.optim.AdamW([embedding.vec], lr=scheduler.learn_rate, weight_decay=0.0)
+    if shared.opts.save_optimizer_state:
+        optimizer_state_dict = None
+        if os.path.exists(filename + '.optim'):
+            optimizer_saved_dict = torch.load(filename + '.optim', map_location='cpu')
+            if embedding.checksum() == optimizer_saved_dict.get('hash', None):
+                optimizer_state_dict = optimizer_saved_dict.get('optimizer_state_dict', None)
+    
+        if optimizer_state_dict is not None:
+            optimizer.load_state_dict(optimizer_state_dict)
+            print("Loaded existing optimizer from checkpoint")
+        else:
+            print("No saved optimizer exists in checkpoint")
+
+    scaler = torch.cuda.amp.GradScaler()
+
+    batch_size = ds.batch_size
+    gradient_step = ds.gradient_step
+    # n steps = batch_size * gradient_step * n image processed
+    steps_per_epoch = len(ds) // batch_size // gradient_step
+    max_steps_per_epoch = len(ds) // batch_size - (len(ds) // batch_size) % gradient_step
+    loss_step = 0
+    _loss_step = 0 #internal
+
+    last_saved_file = "<none>"
+    last_saved_image = "<none>"
+    forced_filename = "<none>"
+    embedding_yet_to_be_embedded = False
+
+    is_training_inpainting_model = shared.sd_model.model.conditioning_key in {'hybrid', 'concat'}
+    img_c = None
+
+    pbar = tqdm.tqdm(total=steps - initial_step)
+    try:
+        for i in range((steps-initial_step) * gradient_step):
+            if scheduler.finished:
+                break
+            if shared.state.interrupted:
+                break
+            for j, batch in enumerate(dl):
+                # works as a drop_last=True for gradient accumulation
+                if j == max_steps_per_epoch:
+                    break
+                scheduler.apply(optimizer, embedding.step)
+                if scheduler.finished:
+                    break
+                if shared.state.interrupted:
+                    break
+
+                if clip_grad:
+                    clip_grad_sched.step(embedding.step)
            
+                with devices.autocast():
+                    x = batch.latent_sample.to(devices.device, non_blocking=pin_memory)
+                    c = shared.sd_model.cond_stage_model(batch.cond_text)

-            optimizer.zero_grad()
-            loss.backward()
-            optimizer.step()
+                    if is_training_inpainting_model:
+                        if img_c is None:
+                            img_c = processing.txt2img_image_conditioning(shared.sd_model, c, training_width, training_height)

-        epoch_num = embedding.step // len(ds)
-        epoch_step = embedding.step - (epoch_num * len(ds)) + 1
+                        cond = {"c_concat": [img_c], "c_crossattn": [c]}
+                    else:
+                        cond = c

-        pbar.set_description(f"[Epoch {epoch_num}: {epoch_step}/{len(ds)}]loss: {losses.mean():.7f}")
+                    loss = shared.sd_model(x, cond)[0] / gradient_step
+                    del x

-        if embedding.step > 0 and embedding_dir is not None and embedding.step % save_embedding_every == 0:
-            last_saved_file = os.path.join(embedding_dir, f'{embedding_name}-{embedding.step}.pt')
-            embedding.save(last_saved_file)
-            embedding_yet_to_be_embedded = True
+                    _loss_step += loss.item()
+                scaler.scale(loss).backward()

-        if shared.opts.training_enable_tensorboard:
-            tensorboard_add(tensorboard_writer, loss=losses.mean(), global_step=embedding.step, 
-                step=epoch_step, learn_rate=scheduler.learn_rate, epoch_num=epoch_num)
+                # go back until we reach gradient accumulation steps
+                if (j + 1) % gradient_step != 0:
+                    continue
+                
+                if clip_grad:
+                    clip_grad(embedding.vec, clip_grad_sched.learn_rate)

-        write_loss(log_directory, "textual_inversion_loss.csv", embedding.step, len(ds), {
-            "loss": f"{losses.mean():.7f}",
-            "learn_rate": scheduler.learn_rate
-        })
+                scaler.step(optimizer)
+                scaler.update()
+                embedding.step += 1
+                pbar.update()
+                optimizer.zero_grad(set_to_none=True)
+                loss_step = _loss_step
+                _loss_step = 0

-        if embedding.step > 0 and images_dir is not None and embedding.step % create_image_every == 0:
-            last_saved_image = os.path.join(images_dir, f'{embedding_name}-{embedding.step}.png')
+                steps_done = embedding.step + 1

-            p = processing.StableDiffusionProcessingTxt2Img(
-                sd_model=shared.sd_model,
-                do_not_save_grid=True,
-                do_not_save_samples=True,
-                do_not_reload_embeddings=True,
-            )
+                epoch_num = embedding.step // steps_per_epoch
+                epoch_step = embedding.step % steps_per_epoch

-            if preview_from_txt2img:
-                p.prompt = preview_prompt
-                p.negative_prompt = preview_negative_prompt
-                p.steps = preview_steps
-                p.sampler_index = preview_sampler_index
-                p.cfg_scale = preview_cfg_scale
-                p.seed = preview_seed
-                p.width = preview_width
-                p.height = preview_height
-            else:
-                p.prompt = entries[0].cond_text
-                p.steps = 20
-                p.width = training_width
-                p.height = training_height
+                description = f"Training textual inversion [Epoch {epoch_num}: {epoch_step+1}/{steps_per_epoch}] loss: {loss_step:.7f}"
+                pbar.set_description(description)
+                shared.state.textinfo = description
+                if embedding_dir is not None and steps_done % save_embedding_every == 0:
+                    # Before saving, change name to match current checkpoint.
+                    embedding_name_every = f'{embedding_name}-{steps_done}'
+                    last_saved_file = os.path.join(embedding_dir, f'{embedding_name_every}.pt')
+                    save_embedding(embedding, optimizer, checkpoint, embedding_name_every, last_saved_file, remove_cached_checksum=True)
+                    embedding_yet_to_be_embedded = True

-            preview_text = p.prompt
+                write_loss(log_directory, "textual_inversion_loss.csv", embedding.step, steps_per_epoch, {
+                    "loss": f"{loss_step:.7f}",
+                    "learn_rate": scheduler.learn_rate
+                })

-            processed = processing.process_images(p)
-            image = processed.images[0]
+                if images_dir is not None and steps_done % create_image_every == 0:
+                    forced_filename = f'{embedding_name}-{steps_done}'
+                    last_saved_image = os.path.join(images_dir, forced_filename)

-            shared.state.current_image = image
+                    shared.sd_model.first_stage_model.to(devices.device)

-            if save_image_with_stored_embedding and os.path.exists(last_saved_file) and embedding_yet_to_be_embedded:
+                    p = processing.StableDiffusionProcessingTxt2Img(
+                        sd_model=shared.sd_model,
+                        do_not_save_grid=True,
+                        do_not_save_samples=True,
+                        do_not_reload_embeddings=True,
+                    )

-                last_saved_image_chunks = os.path.join(images_embeds_dir, f'{embedding_name}-{embedding.step}.png')
+                    if preview_from_txt2img:
+                        p.prompt = preview_prompt
+                        p.negative_prompt = preview_negative_prompt
+                        p.steps = preview_steps
+                        p.sampler_name = sd_samplers.samplers[preview_sampler_index].name
+                        p.cfg_scale = preview_cfg_scale
+                        p.seed = preview_seed
+                        p.width = preview_width
+                        p.height = preview_height
+                    else:
+                        p.prompt = batch.cond_text[0]
+                        p.steps = 20
+                        p.width = training_width
+                        p.height = training_height

-                info = PngImagePlugin.PngInfo()
-                data = torch.load(last_saved_file)
-                info.add_text("sd-ti-embedding", embedding_to_b64(data))
+                    preview_text = p.prompt

-                title = "<{}>".format(data.get('name', '???'))
+                    processed = processing.process_images(p)
+                    image = processed.images[0] if len(processed.images) > 0 else None

-                try:
-                    vectorSize = list(data['string_to_param'].values())[0].shape[0]
-                except Exception as e:
-                    vectorSize = '?'
+                    if unload:
+                        shared.sd_model.first_stage_model.to(devices.cpu)

-                checkpoint = sd_models.select_checkpoint()
-                footer_left = checkpoint.model_name
-                footer_mid = '[{}]'.format(checkpoint.hash)
-                footer_right = '{}v {}s'.format(vectorSize, embedding.step)
+                    if image is not None:
+                        shared.state.current_image = image
+                        last_saved_image, last_text_info = images.save_image(image, images_dir, "", p.seed, p.prompt, shared.opts.samples_format, processed.infotexts[0], p=p, forced_filename=forced_filename, save_to_dirs=False)
+                        last_saved_image += f", prompt: {preview_text}"

-                captioned_image = caption_image_overlay(image, title, footer_left, footer_mid, footer_right)
-                captioned_image = insert_image_data_embed(captioned_image, data)
+                        if shared.opts.training_enable_tensorboard and shared.opts.training_tensorboard_save_images:
+                            tensorboard_add_image(tensorboard_writer, f"Validation at epoch {epoch_num}", image, embedding.step)

-                captioned_image.save(last_saved_image_chunks, "PNG", pnginfo=info)
-                embedding_yet_to_be_embedded = False
+                    if save_image_with_stored_embedding and os.path.exists(last_saved_file) and embedding_yet_to_be_embedded:

-            image.save(last_saved_image)
+                        last_saved_image_chunks = os.path.join(images_embeds_dir, f'{embedding_name}-{steps_done}.png')

-            if shared.opts.training_enable_tensorboard and shared.opts.training_tensorboard_save_images:
-                tensorboard_add_image(tensorboard_writer, f"Validation at epoch {epoch_num}", 
-                    image, embedding.step)
+                        info = PngImagePlugin.PngInfo()
+                        data = torch.load(last_saved_file)
+                        info.add_text("sd-ti-embedding", embedding_to_b64(data))

-            last_saved_image += f", prompt: {preview_text}"
+                        title = "<{}>".format(data.get('name', '???'))

-        shared.state.job_no = embedding.step
+                        try:
+                            vectorSize = list(data['string_to_param'].values())[0].shape[0]
+                        except Exception as e:
+                            vectorSize = '?'

-        shared.state.textinfo = f"""
+                        checkpoint = sd_models.select_checkpoint()
+                        footer_left = checkpoint.model_name
+                        footer_mid = '[{}]'.format(checkpoint.hash)
+                        footer_right = '{}v {}s'.format(vectorSize, steps_done)
+
+                        captioned_image = caption_image_overlay(image, title, footer_left, footer_mid, footer_right)
+                        captioned_image = insert_image_data_embed(captioned_image, data)
+
+                        captioned_image.save(last_saved_image_chunks, "PNG", pnginfo=info)
+                        embedding_yet_to_be_embedded = False
+
+                    last_saved_image, last_text_info = images.save_image(image, images_dir, "", p.seed, p.prompt, shared.opts.samples_format, processed.infotexts[0], p=p, forced_filename=forced_filename, save_to_dirs=False)
+                    last_saved_image += f", prompt: {preview_text}"
+
+                shared.state.job_no = embedding.step
+
+                shared.state.textinfo = f"""
 <p>
-Loss: {losses.mean():.7f}<br/>
-Step: {embedding.step}<br/>
-Last prompt: {html.escape(entries[0].cond_text)}<br/>
+Loss: {loss_step:.7f}<br/>
+Step: {steps_done}<br/>
+Last prompt: {html.escape(batch.cond_text[0])}<br/>
 Last saved embedding: {html.escape(last_saved_file)}<br/>
 Last saved image: {html.escape(last_saved_image)}<br/>
 </p>
 """
-
-    checkpoint = sd_models.select_checkpoint()
-
-    embedding.sd_checkpoint = checkpoint.hash
-    embedding.sd_checkpoint_name = checkpoint.model_name
-    embedding.cached_checksum = None
-    embedding.save(filename)
+        filename = os.path.join(shared.cmd_opts.embeddings_dir, f'{embedding_name}.pt')
+        save_embedding(embedding, optimizer, checkpoint, embedding_name, filename, remove_cached_checksum=True)
+    except Exception:
+        print(traceback.format_exc(), file=sys.stderr)
+        pass
+    finally:
+        pbar.leave = False
+        pbar.close()
+        shared.sd_model.first_stage_model.to(devices.device)
+        shared.parallel_processing_allowed = old_parallel_processing_allowed

    return embedding, filename
+
+def save_embedding(embedding, optimizer, checkpoint, embedding_name, filename, remove_cached_checksum=True):
+    old_embedding_name = embedding.name
+    old_sd_checkpoint = embedding.sd_checkpoint if hasattr(embedding, "sd_checkpoint") else None
+    old_sd_checkpoint_name = embedding.sd_checkpoint_name if hasattr(embedding, "sd_checkpoint_name") else None
+    old_cached_checksum = embedding.cached_checksum if hasattr(embedding, "cached_checksum") else None
+    try:
+        embedding.sd_checkpoint = checkpoint.hash
+        embedding.sd_checkpoint_name = checkpoint.model_name
+        if remove_cached_checksum:
+            embedding.cached_checksum = None
+        embedding.name = embedding_name
+        embedding.optimizer_state_dict = optimizer.state_dict()
+        embedding.save(filename)
+    except:
+        embedding.sd_checkpoint = old_sd_checkpoint
+        embedding.sd_checkpoint_name = old_sd_checkpoint_name
+        embedding.name = old_embedding_name
+        embedding.cached_checksum = old_cached_checksum
+        raise
--- a/modules/textual_inversion/ui.py
+++ b/modules/textual_inversion/ui.py
@@ -7,8 +7,8 @@ import modules.textual_inversion.preprocess
 from modules import sd_hijack, shared


-def create_embedding(name, initialization_text, nvpt):
-    filename = modules.textual_inversion.textual_inversion.create_embedding(name, nvpt, init_text=initialization_text)
+def create_embedding(name, initialization_text, nvpt, overwrite_old):
+    filename = modules.textual_inversion.textual_inversion.create_embedding(name, nvpt, overwrite_old, init_text=initialization_text)

    sd_hijack.model_hijack.embedding_db.load_textual_inversion_embeddings()

@@ -18,15 +18,17 @@ def create_embedding(name, initialization_text, nvpt):
 def preprocess(*args):
    modules.textual_inversion.preprocess.preprocess(*args)

-    return "Preprocessing finished.", ""
+    return f"Preprocessing {'interrupted' if shared.state.interrupted else 'finished'}.", ""


 def train_embedding(*args):

    assert not shared.cmd_opts.lowvram, 'Training models with lowvram not possible'

+    apply_optimizations = shared.opts.training_xattention_optimizations
    try:
-        sd_hijack.undo_optimizations()
+        if not apply_optimizations:
+            sd_hijack.undo_optimizations()

        embedding, filename = modules.textual_inversion.textual_inversion.train_embedding(*args)

@@ -38,5 +40,6 @@ Embedding saved to {html.escape(filename)}
    except Exception:
        raise
    finally:
-        sd_hijack.apply_optimizations()
+        if not apply_optimizations:
+            sd_hijack.apply_optimizations()