(cache)flux-train-qinglong/train_flux_24Glora.ps1 at master · 2575044704/flux-train-qinglong · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
# LoRA train script by @Akegarasu, modified by @bdsqlsz

# Training mode (Lora, db, sdxl_lora, Sdxl_db, sdxl_cn3l, stable_cascade_db, stable_cascade_lora, controlnet, hunyuan_lora, hunyuan_db, sd3_db, flux_lora, flux_db)
$train_mode = "flux_lora"

# Train data path | models and images used for training
$pretrained_model = "./Stable-diffusion/flux/flux1-dev.safetensors" # base model path
$vae = "./VAE/ae.sft"
$is_v2_model = 0 # SD2.0 model | for 2.0 models clip_skip has no effect by default
$v_parameterization = 1 # v-parameterization | required for v2 versions whose base resolution is not 512
$train_data_dir = "./train/qinglong/train" # train dataset path
$reg_data_dir = ""	# regularization dataset path
$network_weights = "" # pretrained weights for LoRA network | to continue training from an existing LoRA model, put its path here
$network_multiplier = 1.0 # LoRA weight multiplier, default 1.0
$training_comment = "this LoRA model created from bdsqlsz by bdsqlsz'script" # training_comment | description of the training run; can hold the author name or trigger keywords
$dataset_class = ""
$dataset_config = "" # dataset config | path to the dataset config file
$disable_mmap_load_safetensors = 0 # speeds up model loading under WSL

# stable_cascade training parameters
$effnet_checkpoint_path = "./VAE/effnet_encoder.safetensors" # effnet, roughly a lightweight VAE
$stage_c_checkpoint_path = "./Stable-diffusion/train/stage_c_bf16.safetensors" # stage_c, equivalent to the base model
$text_model_checkpoint_path = "" # text encoder (TE); if left unset on the first run it is downloaded automatically from Hugging Face
$save_text_model = 1 # 0 = off, 1 = on; on the first training run this saves the TE, afterwards it is not needed - just load it through text_model_checkpoint_path above
$previewer_checkpoint_path = "./Stable-diffusion/train/previewer.safetensors" # previewer model; needed when preview images are enabled
$adaptive_loss_weight = 1 # 0 = off, 1 = on; use adaptive_loss_weight (officially recommended). When off, the P2 loss weight is used

# SD3 training parameters
$clip_l = "./clip/clip_l.safetensors"
$clip_g = "./clip/clip_g.safetensors"
$t5xxl = "./clip/t5xxl_fp16.safetensors"
$t5xxl_device = "" # defaults to cuda; can be switched to CPU when VRAM is insufficient, but that is very slow
$t5xxl_dtype = "fp32" # currently supports fp32, fp16, bf16
$text_encoder_batch_size = 12
$num_last_block_to_freeze = 0
$discrete_flow_shift = 1.15 # discrete flow shift for the Euler discrete scheduler; the sd3 default is 3.0
$apply_t5_attn_mask = 1 # whether to apply the T5 attention mask, default 0

# flux parameters
$ae = $vae
$timestep_sampling = "shift" # timestep sampling method: "sigma" for sd3, "uniform" for plain DDPM, "sigmoid" or "shift" for flux. "shift" requires adjusting discrete_flow_shift
$sigmoid_scale = 1.0 # scale factor for sigmoid sampling, default 1.0. Larger values make the sampling more uniform
$model_prediction_type = "raw" # model prediction type: "raw" for flux, "additive" to add the noise input, or "sigma_scaled" for sd
$guidance_scale = 1.0 # guidance scale, i.e. CFG, default 1.0
$blockwise_fused_optimizers = 1 # whether to use block-wise fused optimizers, default 1
$double_blocks_to_swap = 6 # number of blocks to swap, default 6
$single_blocks_to_swap = 0 # number of blocks to swap, default 0
$cpu_offload_checkpointing = 1 # whether to offload checkpoints to the CPU; enabled by default for finetune
$mem_eff_save = 1 # whether to use memory-efficient saving, default 1
$split_qkv=1 # whether to split QKV, default 1

# Differential training method
$base_weights = "" # paths of models to merge into the base model, separated by spaces. Empty by default (not used).
$base_weights_multiplier = "1.0" # weights of the merged models, separated by spaces, default 1.0.

# Train related params
$resolution = "1024,1024" # image resolution w,h. Non-square is supported, but both values must be multiples of 64.
$batch_size = 2 # batch size: number of images processed per training batch; raise it according to the capability of the GPU.
$max_train_epoches = 18 # max train epochs
$save_every_n_epochs = 4 # save every n epochs

$gradient_checkpointing = 1 # gradient checkpointing; saves VRAM when enabled, but training is slower
$gradient_accumulation_steps = 1 # number of gradient accumulation steps; effectively multiplies the batch size
$optimizer_accumulation_steps = 0

$network_dim = 32 # network dim | commonly 4~128; bigger is not always better
$network_alpha = 32 # network alpha | commonly the same value as network_dim or a smaller one, such as half of network_dim, to prevent underflow. The default is 1; a smaller alpha needs a higher learning rate.

$train_unet_only = 1 # train U-Net only | trades quality for a large reduction in VRAM usage. Can be enabled with 6G of VRAM
$train_text_encoder_only = 0 # train Text Encoder only

$seed = 1026 # reproducible seed | seed for test runs; a prompt plus this seed will most likely reproduce a training image. Can be used to try out trigger keywords

# LORA_PLUS
$enable_lora_plus = 0
$loraplus_lr_ratio = 16
$loraplus_unet_lr_ratio = 16
$loraplus_text_encoder_lr_ratio = 4

# dropout (currently incompatible with lycoris; use the dropout built into lycoris instead)
$network_dropout = 0 # dropout is a machine-learning technique that prevents neural networks from overfitting; 0.1~0.3 suggested
$scale_weight_norms = 1.0 # used together with dropout; max-norm constraint, 1.0 recommended
$rank_dropout = 0 # LoRA-specific rank-level dropout, 0.1~0.3 recommended; not tested much
$module_dropout = 0 # LoRA-specific module-level dropout (i.e. per block module), 0.1~0.3 recommended; not tested much
$caption_dropout_every_n_epochs = 0 # dropout caption
$caption_dropout_rate = 0 # 0~1
$caption_tag_dropout_rate = 0 # 0~1

# noise
$noise_offset = 0 # helps SD generate better blacks and whites (0-1); 0.06 recommended for concepts, 0.1 for art styles
$adaptive_noise_scale = 0 # adaptive offset adjustment, 10%~100% of the noise_offset value
$noise_offset_random_strength = 0 # random strength of the noise offset
$multires_noise_iterations = 0 # number of multi-resolution noise iterations, 6-10 recommended, 0 disables it.
$multires_noise_discount = 0 # multi-resolution noise discount factor, 0.1-0.3 recommended; unused when the option above is off.
$min_snr_gamma = 0 # minimum SNR gamma; lowers the loss at low steps so learning works better. 3-5 recommended: 5 has almost no effect on the original model, 3 changes the final result. Set to 0 to disable.
$ip_noise_gamma = 0 # input perturbation noise, prevents error accumulation
$ip_noise_gamma_random_strength = 0 # random strength of the input perturbation noise
$debiased_estimation_loss = 0 # SNR-based noise correction, an advanced version of min-SNR
$loss_type = "l2" # loss function type: `smooth_l1`, `huber`, `l2` (i.e. MSE)
$huber_schedule = "snr" # huber scheduler: `exponential`, `constant` or `snr`
$huber_c = 0.1 # c parameter of the huber loss function
$immiscible_noise = 0 # whether to enable immiscible noise


# Learning rate
$lr = "1e-5"
$unet_lr = "8e-4"
$text_encoder_lr = "2e-5"
$lr_scheduler = "cosine_with_min_lr"
# "linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup" | the 6 dynamic learning-rate schedules built into PyTorch
# constant: stays fixed; constant_with_warmup: increases linearly, then stays fixed; linear: linear increase then linear decrease; polynomial: linear increase then smooth decay; cosine: cosine curve; cosine_with_restarts: cosine with hard restarts that jump straight back to the maximum.
# Also available: cosine_with_min_lr (suited to LoRA training), warmup_stable_decay (suited to db training), inverse_sqrt
$lr_warmup_steps = 24 # warmup steps | must be 0 when lr_scheduler is constant or adafactor. Only needs to be set when lr_scheduler is constant_with_warmup
$lr_decay_steps = 48 # decay steps | only needed when lr_scheduler is warmup_stable_decay; usually 10% of the total steps
$lr_scheduler_num_cycles = 1 # restarts nums | number of cosine-annealing restarts; only needed when lr_scheduler is cosine_with_restarts
$lr_scheduler_timescale = 0 # time scale | only needed when lr_scheduler is inverse_sqrt; defaults to the same value as lr_warmup_steps
$lr_scheduler_min_lr_ratio = 0.1 # min lr ratio | minimum learning-rate ratio; only needed when lr_scheduler is cosine_with_min_lr or warmup_stable_decay; default 0

# optimizer
$optimizer_type = "PagedAdamW8bit"
# Available optimizers: "adaFactor","AdamW","AdamW8bit","Lion","SGDNesterov","SGDNesterov8bit","DAdaptation",
# Added optimizers: "Lion8bit" (faster, less memory), "DAdaptAdaGrad", "DAdaptAdan" (latest algorithm from Peking University, effect still to be tested), "DAdaptSGD"
# Added DAdaptAdam, DAdaptLion, DAdaptAdanIP; DAdaptAdam is strongly recommended
# Added optimizers: "Sophia" (2x speed, 1.7x VRAM), "Prodigy" (the "prodigy" optimizer, can adapt to DyLoRA)
# PagedAdamW8bit, PagedLion8bit, Adan, Tiger
# AdamWScheduleFree, SGDScheduleFree
# StableAdamW, Ranger
$d_coef = "0.5" # how fast prodigy's D grows
$d0 = "1e-4" # initial learning rate for dadaptation and prodigy
$fused_backward_pass = 0 # for training large models at float32 precision to save VRAM; the optimizer must be adafactor or adamw, and gradient_accumulation_steps must be 1 or disabled.
$fused_optimizer_groups = 0

# grokfast | fast fitting
$gradfilter_ema_alpha = 0 # EMA momentum hyperparameter; set ema_alpha to activate gradfilter_ema, 0.98 recommended, 0 turns it off
$gradfilter_ema_lamb = 2.0 # amplification-factor hyperparameter of the EMA filter.

# Dataset processing | caption-related settings
$shuffle_caption = 1 # randomly shuffle tokens
$keep_tokens = 1 # keep heading N tokens when shuffling caption tokens
$prior_loss_weight = 1 # regularization weight, 0-1
$weighted_captions = 0 # weighted captions; tag weights are recognized with the same syntax as basic webui usage, e.g. (abc), [abc], (abc:1.23). Commas are not allowed inside the brackets or parsing fails. At most 75 tokens per file.
$secondary_separator = ";;;" # secondary separator. Parts joined by this separator are treated as one token and are shuffled and dropped together, then the separator is replaced by caption_separator. For example, aaa;;bbb;;cc is either replaced by aaa,bbb,cc or dropped as a whole.
$keep_tokens_separator = "|||" # separator marking the group of tokens that is kept unchanged
$enable_wildcard = 0 # wildcard random pick, format like {aaa|bbb|ccc}
$caption_prefix = "" # caption prefix; quality words can be added if the base model needs them, e.g. masterpiece, best quality,
$caption_suffix = "" # caption suffix; camera/shot terms can be added if needed, e.g. full body
$alpha_mask = 0 # whether to use alpha (transparency) mask detection

# Output settings
$output_name = "flux-test-loraqkv" # output model name
$save_model_as = "safetensors" # model save ext | ckpt, pt, safetensors
$mixed_precision = "bf16" # default fp16; no and bf16 are also available
$save_precision = "bf16" # default fp16; fp32 and bf16 are also available
$full_fp16 = 0 # enable full fp16 mode; automatic mixed precision becomes fp16, saving more VRAM
$full_bf16 = 0 # full bf16 training; requires a 30-series or newer GPU.
$fp8_base = 1 # enable fp8 mode, saving more VRAM; experimental feature

# Resume training state
$save_state = 0 # save training state | named like <output_name>-??????-state, where ?????? is the epoch number
$resume = "" # resume from state | resume training from a state folder; must be used together with the parameter above. Because of state-file format limits the epoch number and global step are not saved, so they start from 1 again even when resuming; this differs from how network_weights works
$save_state_on_train_end = 0 # save the training state only once, at the end of training

# Save toml file
$output_config = 0 # when enabled a toml config file is written directly, but training cannot run at the same time; turn it off to train normally.
$config_file = "./toml/" + $output_name + ".toml" # output directory and file name; by default the same name as the saved model.

# Sample image output
$enable_sample = 1 # 1 enables sample images, 0 disables them
$sample_at_first = 0 # whether to generate samples right at the start of training
$sample_every_n_epochs = 3 # generate samples every n epochs
$sample_prompts = "./toml/qinglong.txt" # path to the prompt file
$sample_sampler = "euler_a" # sampler: 'ddim', 'pndm', 'heun', 'dpmsolver', 'dpmsolver++', 'dpmsingle', 'k_lms', 'k_euler', 'k_euler_a', 'k_dpm_2', 'k_dpm_2_a'

# wandb log sync
$wandb_api_key = "" # wandb API key, used to log in

# Other settings
$enable_bucket = 1 # enable bucketing
$min_bucket_reso = 256 # arb min resolution
$max_bucket_reso = 2048 # arb max resolution
$bucket_no_upscale = 1 # do not upscale images when bucketing
$persistent_workers = 1 # makes workers persistent, further reduces/eliminates the lag in between epochs. however it may increase memory usage | runs faster but uses more RAM; roughly a 2x speedup
$vae_batch_size = 4 # VAE batch size, 2-4
$clip_skip = 2 # clip skip | rule of thumb, usually 2
$cache_latents = 1 # cache latents
$cache_latents_to_disk = 1 # cache latents to disk so the next training run does not need to re-cache; 1 on, 0 off
$torch_compile = 0 # use torch compile; requires a version newer than 2.1
$dynamo_backend = "inductor" # "eager", "aot_eager", "inductor","aot_ts_nvfuser","nvprims_nvfuser","cudagraphs","aot_torchxla_trace_once" are usable for training
$TORCHINDUCTOR_FX_GRAPH_CACHE = 1 # enable the local FX graph cache.
$TORCHINDUCTOR_CACHE_DIR = "./torch_compile_cache" # location for all on-disk caches.

# lycoris component
$enable_lycoris = 0 # enable lycoris
$conv_dim = 0 # convolution dim, < 32 recommended
$conv_alpha = 0 # convolution alpha, 1 or 0.3 recommended
$algo = "lokr" # algo parameter: selects which kind of lycoris model to train,
# options include lora (i.e. locon),
#loha
#IA3
#lokr
#dylora
#full (train DreamBooth first, then export a lora)
#diag-oft
# It preserves hyperspherical energy by training orthogonal transformations applied to the outputs of each layer.
# According to the original paper it converges faster than LoRA, but this still needs experimentation.
# dim corresponds to the block size: the block size rather than the number of blocks is fixed here, to make it more comparable to LoRA.

$dropout = 0 # lycoris-specific dropout
$preset = "attn-mlp" # preset configuration of modules to train
#full: default preset, train all the layers in the UNet and CLIP
#full-lin: full but skip convolutional layers
#attn-mlp: train all the transformer block (kohya configuration).
#attn-only: only attention layer will be trained, lot of papers only do training on attn layer.
#unet-transformer-only: as same as kohya_ss/sd_scripts with disabled TE, or, attn-mlp preset with train_unet_only enabled.
#unet-convblock-only: only ResBlock, UpSample, DownSample will be trained.
#./toml/example_lycoris.toml: an external config file can also be used directly, to assign different algorithms to individual layers and modules; give the path of the config file. A reference example has been added.

$factor = 8 # lokr-only factor, -1~8; 8 means full dimension
$decompose_both = 1 # lokr parameter: apply LoRA decomposition to both matrices produced by the LoKr decomposition (by default only the larger matrix is decomposed)
$block_size = 4 # for dylora: unit of block splitting; the minimum 1 is also the slowest. Usually one of 4, 8, 12, 16
$use_tucker = 0 # applies to all algorithms except (IA)^3 and full
$use_scalar = 0 # automatically adjusts the initial weights depending on the algorithm
$train_norm = 0 # normalization layers
$dora_wd = 0 # DoRA decomposition, for low rank. Applies to LoRA, LoHa and LoKr
$full_matrix = 1  # full matrix decomposition
$bypass_mode = 0 # bypass mode, designed for bnb 8-bit/4-bit linear layers (QLyCORIS). Applies to LoRA, LoHa and LoKr
$rescaled = 1 # enables rescaling; effect equivalent to OFT
$constrain = 0 # set to a FLOAT value; effect equivalent to COFT

# dylora component
$enable_dylora = 0 # enable dylora; conflicts with lycoris, only one of them can be enabled.
$unit = 4 # unit of block splitting; the minimum 1 is also the slowest. Usually one of 4, 8, 12, 16

# Lora_FA
$enable_lora_fa = 0 # enable lora_fa; conflicts with lycoris and dylora, only one of them can be enabled.

# oft
$enable_oft = 0 # enable oft; conflicts with the options above, only one of them can be enabled.

# block weights | per-block training
$enable_block_weights = 0 # enable per-block training; conflicts with lycoris, only one of them can be enabled.
$down_lr_weight = "1,0.2,1,1,0.2,1,1,0.2,1,1,1,1" # 12 blocks, 12 numbers required, each 0-1. A function form is also supported: sine, cosine, linear, reverse_linear, zeros; e.g. down_lr_weight=cosine+.25
$mid_lr_weight = "1"  # 1 block, 1 number required; otherwise same as above.
$up_lr_weight = "1,1,1,1,1,1,1,1,1,1,1,1"   # 12 blocks, same as down_lr_weight.
$block_lr_zero_threshold = 0  # blocks whose weight does not exceed this value are not trained at all. Default 0.

$enable_block_dim = 0 # enable per-block dim training
$block_dims = "128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128" # per-block dim, 25 blocks
$block_alphas = "16,16,32,16,32,32,64,16,16,64,64,64,16,64,16,64,32,16,16,64,16,16,16,64,16"  # per-block alpha, 25 blocks
$conv_block_dims = "32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32" # per-block conv dim, 25 blocks
$conv_block_alphas = "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1" # per-block conv alpha, 25 blocks

# block lr
$enable_block_lr = 0
$block_lr = "0,$lr,$lr,0,$lr,$lr,0,$lr,$lr,0,$lr,$lr,$lr,$lr,$lr,$lr,$lr,$lr,$lr,$lr,$lr,$lr,0"

# SDXL-specific parameters
#https://www.bilibili.com/video/BV1tk4y137fo/
$min_timestep = 0 # minimum timestep, default 0
$max_timestep = 1000 # maximum timestep, default 1000
$cache_text_encoder_outputs = 1 # cache text encoder outputs; reduces VRAM usage when enabled, but cannot be combined with shuffle
$cache_text_encoder_outputs_to_disk = 1 # cache text encoder outputs (to-disk variant, per the name); reduces VRAM usage when enabled, but cannot be combined with shuffle
$no_half_vae = 0 # disable half precision to prevent black images. Cannot be used together with mixed_precision.
$bucket_reso_steps = 32 # SDXL bucketing can use 32 or 64; 32 gives finer buckets. Default 64

# db checkpoint train
$stop_text_encoder_training = 0
$no_token_padding = 0 # do not pad the tokenizer output

# sdxl_db
$diffusers_xformers = 0
$train_text_encoder = 0
$learning_rate_te1 = "5e-8"
$learning_rate_te2 = "5e-8"


# Forward each of the following settings to the trainer as "--name=value".
# A flag is appended to $ext_args only when its variable is truthy
# (non-empty string / non-zero number); otherwise the flag is omitted.
# Assigning to $null discards the value returned by Add().
if ($mixed_precision) { $null = $ext_args.Add("--mixed_precision=$mixed_precision") }

if ($network_module) { $null = $ext_args.Add("--network_module=$network_module") }

if ($gradient_accumulation_steps) { $null = $ext_args.Add("--gradient_accumulation_steps=$gradient_accumulation_steps") }

if ($optimizer_accumulation_steps) { $null = $ext_args.Add("--optimizer_accumulation_steps=$optimizer_accumulation_steps") }

if ($lr_scheduler) { $null = $ext_args.Add("--lr_scheduler=$lr_scheduler") }

if ($lr_scheduler_num_cycles) { $null = $ext_args.Add("--lr_scheduler_num_cycles=$lr_scheduler_num_cycles") }

# Warmup and decay step counts are multiplied by gradient_accumulation_steps
# (when that is set) before being forwarded. The variables themselves are
# updated in place, so later code sees the scaled values.
if ($lr_warmup_steps) {
  if ($gradient_accumulation_steps) { $lr_warmup_steps *= $gradient_accumulation_steps }
  $null = $ext_args.Add("--lr_warmup_steps=$lr_warmup_steps")
}

if ($lr_decay_steps) {
  if ($gradient_accumulation_steps) { $lr_decay_steps *= $gradient_accumulation_steps }
  $null = $ext_args.Add("--lr_decay_steps=$lr_decay_steps")
}

# Remaining scheduler options: forwarded only when set to a truthy value.
if ($lr_scheduler_timescale) { $null = $ext_args.Add("--lr_scheduler_timescale=$lr_scheduler_timescale") }

if ($lr_scheduler_min_lr_ratio) { $null = $ext_args.Add("--lr_scheduler_min_lr_ratio=$lr_scheduler_min_lr_ratio") }

# Caption dropout options: a value of 0 leaves the flag out entirely.
if ($caption_dropout_every_n_epochs) { $null = $ext_args.Add("--caption_dropout_every_n_epochs=$caption_dropout_every_n_epochs") }
if ($caption_dropout_rate) { $null = $ext_args.Add("--caption_dropout_rate=$caption_dropout_rate") }
if ($caption_tag_dropout_rate) { $null = $ext_args.Add("--caption_tag_dropout_rate=$caption_tag_dropout_rate") }

# run train
# Launch the trainer script under ./sd-scripts through accelerate.
# $launch_args, $laungh_script (sic - the name must match its definition
# elsewhere in this file) and $ext_args are prepared earlier in the script;
# the fixed options below are always passed, followed by the collected extras.
python -m accelerate.commands.launch --num_cpu_threads_per_process=8 $launch_args "./sd-scripts/$laungh_script.py" `
  --output_dir="./output" `
  --logging_dir="./logs" `
  --max_train_epochs=$max_train_epoches `
  --learning_rate=$lr `
  --output_name=$output_name `
  --save_every_n_epochs=$save_every_n_epochs `
  --save_precision=$save_precision `
  --seed=$seed  `
  --max_token_length=225 `
  --caption_extension=".txt" `
  --vae_batch_size=$vae_batch_size `
  $ext_args

# Capture the status immediately: $? and $LASTEXITCODE describe only the most
# recent command. $? is also false when python could not be started at all.
$train_succeeded = $?
$train_exit_code = $LASTEXITCODE

# Report the real outcome instead of unconditionally claiming success.
if ($train_succeeded) {
  Write-Output "Train finished"
}
else {
  Write-Output "Train failed (exit code: $train_exit_code)"
}

# Keep the console window open until the user presses Enter.
Read-Host | Out-Null ;