diff --git a/cv/Self-Supervised Learning/MAE/pytorch/.gitignore b/cv/Self-Supervised Learning/MAE/pytorch/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..5b3f20f4900966f5caab5891b8fb7240a86dd26a --- /dev/null +++ b/cv/Self-Supervised Learning/MAE/pytorch/.gitignore @@ -0,0 +1,8 @@ +*.swp +**/__pycache__/** +imnet_resnet50_scratch/timm_temp/ +.dumbo.json +checkpoints/ +.idea/* +**/.ipynb_checkpoints/** +output/ \ No newline at end of file diff --git a/cv/Self-Supervised Learning/MAE/pytorch/README.md b/cv/Self-Supervised Learning/MAE/pytorch/README.md new file mode 100644 index 0000000000000000000000000000000000000000..27e6bae3305b0977a2bb8964ce2ecb26ed1b77dd --- /dev/null +++ b/cv/Self-Supervised Learning/MAE/pytorch/README.md @@ -0,0 +1,43 @@ +# MAE-pytorch + +## Model description +This repository is built upon BEiT, an unofficial PyTorch implementation of Masked Autoencoders Are Scalable Vision Learners. We implement the pretrain and finetune process according to the paper, but still can't guarantee the performance reported in the paper can be reproduced! 
+ +## Environment + +``` +cd MAE-pytorch +pip3 install -r requirements.txt +mkdir -p /home/datasets/cv/ImageNet_ILSVRC2012 +mkdir -p pretrain +mkdir -p output +``` + +## Download dataset + +``` +cd /home/datasets/cv/ImageNet_ILSVRC2012 +Download the [ImageNet Dataset](https://www.image-net.org/download.php) +``` + +## Download pretrain weight + +``` +cd pretrain +Download the [pretrain_mae_vit_base_mask_0.75_400e.pth](https://drive.google.com/drive/folders/182F5SLwJnGVngkzguTelja4PztYLTXfa) +``` + +## Finetune + +``` +bash run.sh +``` + +## Results on BI-V100 + +``` +| GPUs | FPS | Train Epochs | Accuracy | +|------|-----|--------------|------| +| 1x8 | 1233 | 100 | 82.9% | +``` + diff --git a/cv/Self-Supervised Learning/MAE/pytorch/dataset_folder.py b/cv/Self-Supervised Learning/MAE/pytorch/dataset_folder.py new file mode 100644 index 0000000000000000000000000000000000000000..c3c406dd9b6108ee963411bdcf980ac3ce309e11 --- /dev/null +++ b/cv/Self-Supervised Learning/MAE/pytorch/dataset_folder.py @@ -0,0 +1,245 @@ +# -------------------------------------------------------- +# Based on BEiT, timm, DINO and DeiT code bases +# https://github.com/microsoft/unilm/tree/master/beit +# https://github.com/rwightman/pytorch-image-models/tree/master/timm +# https://github.com/facebookresearch/deit +# https://github.com/facebookresearch/dino +# --------------------------------------------------------' +from torchvision.datasets.vision import VisionDataset + +from PIL import Image + +import os +import os.path +import random +from typing import Any, Callable, cast, Dict, List, Optional, Tuple + + +def has_file_allowed_extension(filename: str, extensions: Tuple[str, ...]) -> bool: + """Checks if a file is an allowed extension. 
+ + Args: + filename (string): path to a file + extensions (tuple of strings): extensions to consider (lowercase) + + Returns: + bool: True if the filename ends with one of given extensions + """ + return filename.lower().endswith(extensions) + + +def is_image_file(filename: str) -> bool: + """Checks if a file is an allowed image extension. + + Args: + filename (string): path to a file + + Returns: + bool: True if the filename ends with a known image extension + """ + return has_file_allowed_extension(filename, IMG_EXTENSIONS) + + +def make_dataset( + directory: str, + class_to_idx: Dict[str, int], + extensions: Optional[Tuple[str, ...]] = None, + is_valid_file: Optional[Callable[[str], bool]] = None, +) -> List[Tuple[str, int]]: + instances = [] + directory = os.path.expanduser(directory) + both_none = extensions is None and is_valid_file is None + both_something = extensions is not None and is_valid_file is not None + if both_none or both_something: + raise ValueError("Both extensions and is_valid_file cannot be None or not None at the same time") + if extensions is not None: + def is_valid_file(x: str) -> bool: + return has_file_allowed_extension(x, cast(Tuple[str, ...], extensions)) + is_valid_file = cast(Callable[[str], bool], is_valid_file) + for target_class in sorted(class_to_idx.keys()): + class_index = class_to_idx[target_class] + target_dir = os.path.join(directory, target_class) + if not os.path.isdir(target_dir): + continue + for root, _, fnames in sorted(os.walk(target_dir, followlinks=True)): + for fname in sorted(fnames): + path = os.path.join(root, fname) + if is_valid_file(path): + item = path, class_index + instances.append(item) + return instances + + +class DatasetFolder(VisionDataset): + """A generic data loader where the samples are arranged in this way: :: + + root/class_x/xxx.ext + root/class_x/xxy.ext + root/class_x/xxz.ext + + root/class_y/123.ext + root/class_y/nsdf3.ext + root/class_y/asd932_.ext + + Args: + root (string): Root 
directory path. + loader (callable): A function to load a sample given its path. + extensions (tuple[string]): A list of allowed extensions. + both extensions and is_valid_file should not be passed. + transform (callable, optional): A function/transform that takes in + a sample and returns a transformed version. + E.g, ``transforms.RandomCrop`` for images. + target_transform (callable, optional): A function/transform that takes + in the target and transforms it. + is_valid_file (callable, optional): A function that takes path of a file + and check if the file is a valid file (used to check of corrupt files) + both extensions and is_valid_file should not be passed. + + Attributes: + classes (list): List of the class names sorted alphabetically. + class_to_idx (dict): Dict with items (class_name, class_index). + samples (list): List of (sample path, class_index) tuples + targets (list): The class_index value for each image in the dataset + """ + + def __init__( + self, + root: str, + loader: Callable[[str], Any], + extensions: Optional[Tuple[str, ...]] = None, + transform: Optional[Callable] = None, + target_transform: Optional[Callable] = None, + is_valid_file: Optional[Callable[[str], bool]] = None, + ) -> None: + super(DatasetFolder, self).__init__(root, transform=transform, + target_transform=target_transform) + classes, class_to_idx = self._find_classes(self.root) + samples = make_dataset(self.root, class_to_idx, extensions, is_valid_file) + if len(samples) == 0: + msg = "Found 0 files in subfolders of: {}\n".format(self.root) + if extensions is not None: + msg += "Supported extensions are: {}".format(",".join(extensions)) + raise RuntimeError(msg) + + self.loader = loader + self.extensions = extensions + + self.classes = classes + self.class_to_idx = class_to_idx + self.samples = samples + self.targets = [s[1] for s in samples] + + def _find_classes(self, dir: str) -> Tuple[List[str], Dict[str, int]]: + """ + Finds the class folders in a dataset. 
+ + Args: + dir (string): Root directory path. + + Returns: + tuple: (classes, class_to_idx) where classes are relative to (dir), and class_to_idx is a dictionary. + + Ensures: + No class is a subdirectory of another. + """ + classes = [d.name for d in os.scandir(dir) if d.is_dir()] + classes.sort() + class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)} + return classes, class_to_idx + + def __getitem__(self, index: int) -> Tuple[Any, Any]: + """ + Args: + index (int): Index + + Returns: + tuple: (sample, target) where target is class_index of the target class. + """ + while True: + try: + path, target = self.samples[index] + sample = self.loader(path) + break + except Exception as e: + print(e) + index = random.randint(0, len(self.samples) - 1) + + if self.transform is not None: + sample = self.transform(sample) + if self.target_transform is not None: + target = self.target_transform(target) + + return sample, target + + def __len__(self) -> int: + return len(self.samples) + + +IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', '.tiff', '.webp') + + +def pil_loader(path: str) -> Image.Image: + # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) + with open(path, 'rb') as f: + img = Image.open(f) + return img.convert('RGB') + + +# TODO: specify the return type +def accimage_loader(path: str) -> Any: + import accimage + try: + return accimage.Image(path) + except IOError: + # Potentially a decoding problem, fall back to PIL.Image + return pil_loader(path) + + +def default_loader(path: str) -> Any: + from torchvision import get_image_backend + if get_image_backend() == 'accimage': + return accimage_loader(path) + else: + return pil_loader(path) + + +class ImageFolder(DatasetFolder): + """A generic data loader where the images are arranged in this way: :: + + root/dog/xxx.png + root/dog/xxy.png + root/dog/xxz.png + + root/cat/123.png + root/cat/nsdf3.png + root/cat/asd932_.png + + 
Args: + root (string): Root directory path. + transform (callable, optional): A function/transform that takes in an PIL image + and returns a transformed version. E.g, ``transforms.RandomCrop`` + target_transform (callable, optional): A function/transform that takes in the + target and transforms it. + loader (callable, optional): A function to load an image given its path. + is_valid_file (callable, optional): A function that takes path of an Image file + and check if the file is a valid file (used to check of corrupt files) + + Attributes: + classes (list): List of the class names sorted alphabetically. + class_to_idx (dict): Dict with items (class_name, class_index). + imgs (list): List of (image path, class_index) tuples + """ + + def __init__( + self, + root: str, + transform: Optional[Callable] = None, + target_transform: Optional[Callable] = None, + loader: Callable[[str], Any] = default_loader, + is_valid_file: Optional[Callable[[str], bool]] = None, + ): + super(ImageFolder, self).__init__(root, loader, IMG_EXTENSIONS if is_valid_file is None else None, + transform=transform, + target_transform=target_transform, + is_valid_file=is_valid_file) + self.imgs = self.samples diff --git a/cv/Self-Supervised Learning/MAE/pytorch/datasets.py b/cv/Self-Supervised Learning/MAE/pytorch/datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..b7f3e3c8a01d5fa9b23f151d514d4db6fb60f14e --- /dev/null +++ b/cv/Self-Supervised Learning/MAE/pytorch/datasets.py @@ -0,0 +1,133 @@ +# -------------------------------------------------------- +# Based on BEiT, timm, DINO and DeiT code bases +# https://github.com/microsoft/unilm/tree/master/beit +# https://github.com/rwightman/pytorch-image-models/tree/master/timm +# https://github.com/facebookresearch/deit +# https://github.com/facebookresearch/dino +# --------------------------------------------------------' +import os +import torch + +from torchvision import datasets, transforms + +from timm.data.constants 
import \ + IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD + +from timm.data import create_transform + +from masking_generator import RandomMaskingGenerator +from dataset_folder import ImageFolder + + +class DataAugmentationForMAE(object): + def __init__(self, args): + imagenet_default_mean_and_std = args.imagenet_default_mean_and_std + mean = IMAGENET_INCEPTION_MEAN if not imagenet_default_mean_and_std else IMAGENET_DEFAULT_MEAN + std = IMAGENET_INCEPTION_STD if not imagenet_default_mean_and_std else IMAGENET_DEFAULT_STD + + self.transform = transforms.Compose([ + transforms.RandomResizedCrop(args.input_size), + transforms.ToTensor(), + transforms.Normalize( + mean=torch.tensor(mean), + std=torch.tensor(std)) + ]) + + self.masked_position_generator = RandomMaskingGenerator( + args.window_size, args.mask_ratio + ) + + def __call__(self, image): + return self.transform(image), self.masked_position_generator() + + def __repr__(self): + repr = "(DataAugmentationForBEiT,\n" + repr += " transform = %s,\n" % str(self.transform) + repr += " Masked position generator = %s,\n" % str(self.masked_position_generator) + repr += ")" + return repr + + +def build_pretraining_dataset(args): + transform = DataAugmentationForMAE(args) + print("Data Aug = %s" % str(transform)) + return ImageFolder(args.data_path, transform=transform) + + +def build_dataset(is_train, args): + transform = build_transform(is_train, args) + + print("Transform = ") + if isinstance(transform, tuple): + for trans in transform: + print(" - - - - - - - - - - ") + for t in trans.transforms: + print(t) + else: + for t in transform.transforms: + print(t) + print("---------------------------") + + if args.data_set == 'CIFAR': + dataset = datasets.CIFAR100(args.data_path, train=is_train, transform=transform) + nb_classes = 100 + elif args.data_set == 'IMNET': + root = os.path.join(args.data_path, 'train' if is_train else 'val') + dataset = datasets.ImageFolder(root, 
transform=transform) + nb_classes = 1000 + elif args.data_set == "image_folder": + root = args.data_path if is_train else args.eval_data_path + dataset = ImageFolder(root, transform=transform) + nb_classes = args.nb_classes + assert len(dataset.class_to_idx) == nb_classes + else: + raise NotImplementedError() + assert nb_classes == args.nb_classes + print("Number of the class = %d" % args.nb_classes) + + return dataset, nb_classes + + +def build_transform(is_train, args): + resize_im = args.input_size > 32 + imagenet_default_mean_and_std = args.imagenet_default_mean_and_std + mean = IMAGENET_INCEPTION_MEAN if not imagenet_default_mean_and_std else IMAGENET_DEFAULT_MEAN + std = IMAGENET_INCEPTION_STD if not imagenet_default_mean_and_std else IMAGENET_DEFAULT_STD + + if is_train: + # this should always dispatch to transforms_imagenet_train + transform = create_transform( + input_size=args.input_size, + is_training=True, + color_jitter=args.color_jitter, + auto_augment=args.aa, + interpolation=args.train_interpolation, + re_prob=args.reprob, + re_mode=args.remode, + re_count=args.recount, + mean=mean, + std=std, + ) + if not resize_im: + # replace RandomResizedCropAndInterpolation with + # RandomCrop + transform.transforms[0] = transforms.RandomCrop( + args.input_size, padding=4) + return transform + + t = [] + if resize_im: + if args.crop_pct is None: + if args.input_size < 384: + args.crop_pct = 224 / 256 + else: + args.crop_pct = 1.0 + size = int(args.input_size / args.crop_pct) + t.append( + transforms.Resize(size, interpolation=3), # to maintain same ratio w.r.t. 
224 images + ) + t.append(transforms.CenterCrop(args.input_size)) + + t.append(transforms.ToTensor()) + t.append(transforms.Normalize(mean, std)) + return transforms.Compose(t) diff --git a/cv/Self-Supervised Learning/MAE/pytorch/engine_for_finetuning.py b/cv/Self-Supervised Learning/MAE/pytorch/engine_for_finetuning.py new file mode 100644 index 0000000000000000000000000000000000000000..b79e965da6685df9f5248a726c3560f7ccf91839 --- /dev/null +++ b/cv/Self-Supervised Learning/MAE/pytorch/engine_for_finetuning.py @@ -0,0 +1,182 @@ +# -------------------------------------------------------- +# Based on BEiT, timm, DINO and DeiT code bases +# https://github.com/microsoft/unilm/tree/master/beit +# https://github.com/rwightman/pytorch-image-models/tree/master/timm +# https://github.com/facebookresearch/deit +# https://github.com/facebookresearch/dino +# --------------------------------------------------------' +import math +import sys +from typing import Iterable, Optional + +import torch + +from timm.data import Mixup +from timm.utils import accuracy, ModelEma + +import utils + + +def train_class_batch(model, samples, target, criterion): + outputs = model(samples) + loss = criterion(outputs, target) + return loss, outputs + + +def get_loss_scale_for_deepspeed(model): + optimizer = model.optimizer + return optimizer.loss_scale if hasattr(optimizer, "loss_scale") else optimizer.cur_scale + + +def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, + data_loader: Iterable, optimizer: torch.optim.Optimizer, + device: torch.device, epoch: int, loss_scaler, max_norm: float = 0, + model_ema: Optional[ModelEma] = None, mixup_fn: Optional[Mixup] = None, log_writer=None, + start_steps=None, lr_schedule_values=None, wd_schedule_values=None, + num_training_steps_per_epoch=None, update_freq=None): + model.train(True) + metric_logger = utils.MetricLogger(delimiter=" ") + metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}')) + 
metric_logger.add_meter('min_lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}')) + header = 'Epoch: [{}]'.format(epoch) + print_freq = 10 + + if loss_scaler is None: + model.zero_grad() + model.micro_steps = 0 + else: + optimizer.zero_grad() + + for data_iter_step, (samples, targets) in enumerate(metric_logger.log_every(data_loader, print_freq, header)): + step = data_iter_step // update_freq + if step >= num_training_steps_per_epoch: + continue + it = start_steps + step # global training iteration + # Update LR & WD for the first acc + if lr_schedule_values is not None or wd_schedule_values is not None and data_iter_step % update_freq == 0: + for i, param_group in enumerate(optimizer.param_groups): + if lr_schedule_values is not None: + param_group["lr"] = lr_schedule_values[it] * param_group["lr_scale"] + if wd_schedule_values is not None and param_group["weight_decay"] > 0: + param_group["weight_decay"] = wd_schedule_values[it] + + samples = samples.to(device, non_blocking=True) + targets = targets.to(device, non_blocking=True) + + if mixup_fn is not None: + samples, targets = mixup_fn(samples, targets) + + if loss_scaler is None: + samples = samples.half() + loss, output = train_class_batch( + model, samples, targets, criterion) + else: + with torch.cuda.amp.autocast(): + loss, output = train_class_batch( + model, samples, targets, criterion) + + loss_value = loss.item() + + if not math.isfinite(loss_value): + print("Loss is {}, stopping training".format(loss_value)) + sys.exit(1) + + if loss_scaler is None: + loss /= update_freq + model.backward(loss) + model.step() + + if (data_iter_step + 1) % update_freq == 0: + # model.zero_grad() + # Deepspeed will call step() & model.zero_grad() automatic + if model_ema is not None: + model_ema.update(model) + grad_norm = None + loss_scale_value = get_loss_scale_for_deepspeed(model) + else: + # this attribute is added by timm on one optimizer (adahessian) + is_second_order = hasattr(optimizer, 'is_second_order') 
and optimizer.is_second_order + loss /= update_freq + grad_norm = loss_scaler(loss, optimizer, clip_grad=max_norm, + parameters=model.parameters(), create_graph=is_second_order, + update_grad=(data_iter_step + 1) % update_freq == 0) + if (data_iter_step + 1) % update_freq == 0: + optimizer.zero_grad() + if model_ema is not None: + model_ema.update(model) + loss_scale_value = loss_scaler.state_dict()["scale"] + + torch.cuda.synchronize() + + if mixup_fn is None: + class_acc = (output.max(-1)[-1] == targets).float().mean() + else: + class_acc = None + metric_logger.update(loss=loss_value) + metric_logger.update(class_acc=class_acc) + metric_logger.update(loss_scale=loss_scale_value) + min_lr = 10. + max_lr = 0. + for group in optimizer.param_groups: + min_lr = min(min_lr, group["lr"]) + max_lr = max(max_lr, group["lr"]) + + metric_logger.update(lr=max_lr) + metric_logger.update(min_lr=min_lr) + weight_decay_value = None + for group in optimizer.param_groups: + if group["weight_decay"] > 0: + weight_decay_value = group["weight_decay"] + metric_logger.update(weight_decay=weight_decay_value) + metric_logger.update(grad_norm=grad_norm) + + if log_writer is not None: + log_writer.update(loss=loss_value, head="loss") + log_writer.update(class_acc=class_acc, head="loss") + log_writer.update(loss_scale=loss_scale_value, head="opt") + log_writer.update(lr=max_lr, head="opt") + log_writer.update(min_lr=min_lr, head="opt") + log_writer.update(weight_decay=weight_decay_value, head="opt") + log_writer.update(grad_norm=grad_norm, head="opt") + + log_writer.set_step() + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} + + +@torch.no_grad() +def evaluate(data_loader, model, device): + criterion = torch.nn.CrossEntropyLoss() + + metric_logger = utils.MetricLogger(delimiter=" ") + header = 'Test:' + + # switch to evaluation mode + 
model.eval() + + for batch in metric_logger.log_every(data_loader, 10, header): + images = batch[0] + target = batch[-1] + images = images.to(device, non_blocking=True) + target = target.to(device, non_blocking=True) + + # compute output + with torch.cuda.amp.autocast(): + output = model(images) + loss = criterion(output, target) + + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + + batch_size = images.shape[0] + metric_logger.update(loss=loss.item()) + metric_logger.meters['acc1'].update(acc1.item(), n=batch_size) + metric_logger.meters['acc5'].update(acc5.item(), n=batch_size) + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print('* Acc@1 {top1.global_avg:.3f} Acc@5 {top5.global_avg:.3f} loss {losses.global_avg:.3f}' + .format(top1=metric_logger.acc1, top5=metric_logger.acc5, losses=metric_logger.loss)) + + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} diff --git a/cv/Self-Supervised Learning/MAE/pytorch/engine_for_pretraining.py b/cv/Self-Supervised Learning/MAE/pytorch/engine_for_pretraining.py new file mode 100644 index 0000000000000000000000000000000000000000..e0977f98964ab2b7a99f58d725789c2e10015d29 --- /dev/null +++ b/cv/Self-Supervised Learning/MAE/pytorch/engine_for_pretraining.py @@ -0,0 +1,116 @@ +# -------------------------------------------------------- +# Based on BEiT, timm, DINO and DeiT code bases +# https://github.com/microsoft/unilm/tree/master/beit +# https://github.com/rwightman/pytorch-image-models/tree/master/timm +# https://github.com/facebookresearch/deit +# https://github.com/facebookresearch/dino +# --------------------------------------------------------' +import math +import sys +from typing import Iterable + +import torch +import torch.nn as nn + +import utils +from einops import rearrange +from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD + +def train_one_epoch(model: torch.nn.Module, data_loader: Iterable, optimizer: 
torch.optim.Optimizer, + device: torch.device, epoch: int, loss_scaler, max_norm: float = 0, patch_size: int = 16, + normlize_target: bool = True, log_writer=None, lr_scheduler=None, start_steps=None, + lr_schedule_values=None, wd_schedule_values=None): + model.train() + metric_logger = utils.MetricLogger(delimiter=" ") + metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}')) + metric_logger.add_meter('min_lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}')) + header = 'Epoch: [{}]'.format(epoch) + print_freq = 10 + + loss_func = nn.MSELoss() + + for step, (batch, _) in enumerate(metric_logger.log_every(data_loader, print_freq, header)): + # assign learning rate & weight decay for each step + it = start_steps + step # global training iteration + if lr_schedule_values is not None or wd_schedule_values is not None: + for i, param_group in enumerate(optimizer.param_groups): + if lr_schedule_values is not None: + param_group["lr"] = lr_schedule_values[it] * param_group["lr_scale"] + if wd_schedule_values is not None and param_group["weight_decay"] > 0: + param_group["weight_decay"] = wd_schedule_values[it] + + images, bool_masked_pos = batch + images = images.to(device, non_blocking=True) + bool_masked_pos = bool_masked_pos.to(device, non_blocking=True).flatten(1).to(torch.bool) + + # import pdb; pdb.set_trace() + with torch.no_grad(): + # calculate the predict label + mean = torch.as_tensor(IMAGENET_DEFAULT_MEAN).to(device)[None, :, None, None] + std = torch.as_tensor(IMAGENET_DEFAULT_STD).to(device)[None, :, None, None] + unnorm_images = images * std + mean # in [0, 1] + + if normlize_target: + images_squeeze = rearrange(unnorm_images, 'b c (h p1) (w p2) -> b (h w) (p1 p2) c', p1=patch_size, p2=patch_size) + images_norm = (images_squeeze - images_squeeze.mean(dim=-2, keepdim=True) + ) / (images_squeeze.var(dim=-2, unbiased=True, keepdim=True).sqrt() + 1e-6) + # we find that the mean is about 0.48 and standard deviation is about 
0.08. + images_patch = rearrange(images_norm, 'b n p c -> b n (p c)') + else: + images_patch = rearrange(unnorm_images, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=patch_size, p2=patch_size) + + B, _, C = images_patch.shape + labels = images_patch[bool_masked_pos].reshape(B, -1, C) + + with torch.cuda.amp.autocast(): + outputs = model(images, bool_masked_pos) + loss = loss_func(input=outputs, target=labels) + + loss_value = loss.item() + + if not math.isfinite(loss_value): + print("Loss is {}, stopping training".format(loss_value)) + sys.exit(1) + + optimizer.zero_grad() + # this attribute is added by timm on one optimizer (adahessian) + is_second_order = hasattr(optimizer, 'is_second_order') and optimizer.is_second_order + grad_norm = loss_scaler(loss, optimizer, clip_grad=max_norm, + parameters=model.parameters(), create_graph=is_second_order) + loss_scale_value = loss_scaler.state_dict()["scale"] + + torch.cuda.synchronize() + + metric_logger.update(loss=loss_value) + metric_logger.update(loss_scale=loss_scale_value) + min_lr = 10. + max_lr = 0. 
+ for group in optimizer.param_groups: + min_lr = min(min_lr, group["lr"]) + max_lr = max(max_lr, group["lr"]) + + metric_logger.update(lr=max_lr) + metric_logger.update(min_lr=min_lr) + weight_decay_value = None + for group in optimizer.param_groups: + if group["weight_decay"] > 0: + weight_decay_value = group["weight_decay"] + metric_logger.update(weight_decay=weight_decay_value) + metric_logger.update(grad_norm=grad_norm) + + if log_writer is not None: + log_writer.update(loss=loss_value, head="loss") + log_writer.update(loss_scale=loss_scale_value, head="opt") + log_writer.update(lr=max_lr, head="opt") + log_writer.update(min_lr=min_lr, head="opt") + log_writer.update(weight_decay=weight_decay_value, head="opt") + log_writer.update(grad_norm=grad_norm, head="opt") + + log_writer.set_step() + + if lr_scheduler is not None: + lr_scheduler.step_update(start_steps + step) + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} diff --git a/cv/Self-Supervised Learning/MAE/pytorch/files/ILSVRC2012_val_00031649.JPEG b/cv/Self-Supervised Learning/MAE/pytorch/files/ILSVRC2012_val_00031649.JPEG new file mode 100644 index 0000000000000000000000000000000000000000..e7f5c23b8bfc750712322381810f3afe6ab5afbf Binary files /dev/null and b/cv/Self-Supervised Learning/MAE/pytorch/files/ILSVRC2012_val_00031649.JPEG differ diff --git a/cv/Self-Supervised Learning/MAE/pytorch/files/pretrain_base_0.75_400e.txt b/cv/Self-Supervised Learning/MAE/pytorch/files/pretrain_base_0.75_400e.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a771f6e4b339e9cb9196b0bfe969311e7315fb6 --- /dev/null +++ b/cv/Self-Supervised Learning/MAE/pytorch/files/pretrain_base_0.75_400e.txt @@ -0,0 +1,400 @@ +{"train_lr": 2.99062424873788e-05, "train_min_lr": 2.99062424873788e-05, "train_loss": 0.9935376493690106, "train_loss_scale": 65536.0, 
"train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03353406208453891, "epoch": 0, "n_parameters": 93325440} +{"train_lr": 8.991105056494908e-05, "train_min_lr": 8.991105056494908e-05, "train_loss": 0.9758817947302492, "train_loss_scale": 65536.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.08507240483035836, "epoch": 1, "n_parameters": 93325440} +{"train_lr": 0.0001499158586425194, "train_min_lr": 0.0001499158586425194, "train_loss": 0.9611001945793246, "train_loss_scale": 65536.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.128755036335534, "epoch": 2, "n_parameters": 93325440} +{"train_lr": 0.00020992066672008975, "train_min_lr": 0.00020992066672008975, "train_loss": 0.9310973301434364, "train_loss_scale": 65536.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.17579967240826824, "epoch": 3, "n_parameters": 93325440} +{"train_lr": 0.00026992547479766013, "train_min_lr": 0.00026992547479766013, "train_loss": 0.887464275273184, "train_loss_scale": 65536.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.2299580722092054, "epoch": 4, "n_parameters": 93325440} +{"train_lr": 0.0003299302828752303, "train_min_lr": 0.0003299302828752303, "train_loss": 0.8575904849821177, "train_loss_scale": 65536.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.22862222800270104, "epoch": 5, "n_parameters": 93325440} +{"train_lr": 0.00038993509095280063, "train_min_lr": 0.00038993509095280063, "train_loss": 0.8232777749355404, "train_loss_scale": 104395.48717948717, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.21724188721810395, "epoch": 6, "n_parameters": 93325440} +{"train_lr": 0.00044993989903037104, "train_min_lr": 0.00044993989903037104, "train_loss": 0.7865594529952759, "train_loss_scale": 131072.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.19654494937127218, "epoch": 7, "n_parameters": 93325440} +{"train_lr": 
0.0005099447071079412, "train_min_lr": 0.0005099447071079412, "train_loss": 0.7636158630156364, "train_loss_scale": 131072.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.171916592746782, "epoch": 8, "n_parameters": 93325440} +{"train_lr": 0.0005699495151855116, "train_min_lr": 0.0005699495151855116, "train_loss": 0.7478578644924057, "train_loss_scale": 131072.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.1429553751188975, "epoch": 9, "n_parameters": 93325440} +{"train_lr": 0.0006299543232630818, "train_min_lr": 0.0006299543232630818, "train_loss": 0.7356351461404791, "train_loss_scale": 131072.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.12458268381082095, "epoch": 10, "n_parameters": 93325440} +{"train_lr": 0.0006899591313406522, "train_min_lr": 0.0006899591313406522, "train_loss": 0.726113576346483, "train_loss_scale": 131072.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.11398386040654702, "epoch": 11, "n_parameters": 93325440} +{"train_lr": 0.0007499639394182228, "train_min_lr": 0.0007499639394182228, "train_loss": 0.71920872768626, "train_loss_scale": 155017.84615384616, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.10540476584663758, "epoch": 12, "n_parameters": 93325440} +{"train_lr": 0.0008099687474957929, "train_min_lr": 0.0008099687474957929, "train_loss": 0.7139992883715492, "train_loss_scale": 262144.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.10431485875055958, "epoch": 13, "n_parameters": 93325440} +{"train_lr": 0.0008699735555733632, "train_min_lr": 0.0008699735555733632, "train_loss": 0.708360363323337, "train_loss_scale": 262144.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.0887956558010326, "epoch": 14, "n_parameters": 93325440} +{"train_lr": 0.0009299783636509334, "train_min_lr": 0.0009299783636509334, "train_loss": 0.7042842504735558, "train_loss_scale": 262144.0, "train_weight_decay": 
0.050000000000000266, "train_grad_norm": 0.08402582650813155, "epoch": 15, "n_parameters": 93325440} +{"train_lr": 0.0009899831717285039, "train_min_lr": 0.0009899831717285039, "train_loss": 0.7004435619769188, "train_loss_scale": 262144.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07971460644442302, "epoch": 16, "n_parameters": 93325440} +{"train_lr": 0.001049987979806074, "train_min_lr": 0.001049987979806074, "train_loss": 0.6999204354838301, "train_loss_scale": 262144.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.08760032328800896, "epoch": 17, "n_parameters": 93325440} +{"train_lr": 0.0011099927878836444, "train_min_lr": 0.0011099927878836444, "train_loss": 0.6953817655881628, "train_loss_scale": 262144.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07566600679778136, "epoch": 18, "n_parameters": 93325440} +{"train_lr": 0.0011699975959612145, "train_min_lr": 0.0011699975959612145, "train_loss": 0.6931444115411395, "train_loss_scale": 464633.4358974359, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07172701840933698, "epoch": 19, "n_parameters": 93325440} +{"train_lr": 0.0012300024040387849, "train_min_lr": 0.0012300024040387849, "train_loss": 0.6906391145565953, "train_loss_scale": 524288.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06653258443260804, "epoch": 20, "n_parameters": 93325440} +{"train_lr": 0.0012900072121163552, "train_min_lr": 0.0012900072121163552, "train_loss": 0.6887923278965247, "train_loss_scale": 524288.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06384101945859118, "epoch": 21, "n_parameters": 93325440} +{"train_lr": 0.0013500120201939254, "train_min_lr": 0.0013500120201939254, "train_loss": 0.6873886124350321, "train_loss_scale": 524288.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06326920523618658, "epoch": 22, "n_parameters": 93325440} +{"train_lr": 0.0014100168282714964, 
"train_min_lr": 0.0014100168282714964, "train_loss": 0.6858006822518431, "train_loss_scale": 524288.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06038653005201083, "epoch": 23, "n_parameters": 93325440} +{"train_lr": 0.0014700216363490658, "train_min_lr": 0.0014700216363490658, "train_loss": 0.6844061892479658, "train_loss_scale": 524288.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.057754088969280325, "epoch": 24, "n_parameters": 93325440} +{"train_lr": 0.0015300264444266366, "train_min_lr": 0.0015300264444266366, "train_loss": 0.6839535279342761, "train_loss_scale": 714174.358974359, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.05955945764883207, "epoch": 25, "n_parameters": 93325440} +{"train_lr": 0.0015900312525042061, "train_min_lr": 0.0015900312525042061, "train_loss": 0.6827620689351207, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.05615876699821689, "epoch": 26, "n_parameters": 93325440} +{"train_lr": 0.0016500360605817771, "train_min_lr": 0.0016500360605817771, "train_loss": 0.6814118546123306, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.05241773727660378, "epoch": 27, "n_parameters": 93325440} +{"train_lr": 0.0017100408686593481, "train_min_lr": 0.0017100408686593481, "train_loss": 0.6803960090455337, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.050834771377058365, "epoch": 28, "n_parameters": 93325440} +{"train_lr": 0.0017700456767369176, "train_min_lr": 0.0017700456767369176, "train_loss": 0.6797578791156411, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.05221204828614226, "epoch": 29, "n_parameters": 93325440} +{"train_lr": 0.0018300504848144882, "train_min_lr": 0.0018300504848144882, "train_loss": 0.678908175502259, "train_loss_scale": 1048576.0, "train_weight_decay": 
0.050000000000000266, "train_grad_norm": 0.04822072399278673, "epoch": 30, "n_parameters": 93325440} +{"train_lr": 0.001890055292892058, "train_min_lr": 0.001890055292892058, "train_loss": 0.67856838229375, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.048526402544946626, "epoch": 31, "n_parameters": 93325440} +{"train_lr": 0.0019500601009696296, "train_min_lr": 0.0019500601009696296, "train_loss": 0.6778434767411687, "train_loss_scale": 2046739.6923076923, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04580618743187724, "epoch": 32, "n_parameters": 93325440} +{"train_lr": 0.0020100649090471993, "train_min_lr": 0.0020100649090471993, "train_loss": 0.6768887781370909, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.044690437405967176, "epoch": 33, "n_parameters": 93325440} +{"train_lr": 0.002070069717124769, "train_min_lr": 0.002070069717124769, "train_loss": 0.6764640975743532, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.044290755033636324, "epoch": 34, "n_parameters": 93325440} +{"train_lr": 0.0021300745252023395, "train_min_lr": 0.0021300745252023395, "train_loss": 0.6758714542748072, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04358653886578022, "epoch": 35, "n_parameters": 93325440} +{"train_lr": 0.0021900793332799103, "train_min_lr": 0.0021900793332799103, "train_loss": 0.6754509876601589, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.041925776057327405, "epoch": 36, "n_parameters": 93325440} +{"train_lr": 0.00225008414135748, "train_min_lr": 0.00225008414135748, "train_loss": 0.6753806370095565, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04360225025373392, "epoch": 37, "n_parameters": 93325440} +{"train_lr": 0.002310088949435051, 
"train_min_lr": 0.002310088949435051, "train_loss": 0.6767300866448727, "train_loss_scale": 283149.1282051282, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 38, "n_parameters": 93325440} +{"train_lr": 0.0023700937575126205, "train_min_lr": 0.0023700937575126205, "train_loss": 0.6744202525617603, "train_loss_scale": 262144.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.039843473296899065, "epoch": 39, "n_parameters": 93325440} +{"train_lr": 0.002399984905490592, "train_min_lr": 0.002399984905490592, "train_loss": 0.6737221590697001, "train_loss_scale": 262144.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03926232078661903, "epoch": 40, "n_parameters": 93325440} +{"train_lr": 0.0023998940486030145, "train_min_lr": 0.0023998940486030145, "train_loss": 0.6729661250869051, "train_loss_scale": 262144.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03798527103992036, "epoch": 41, "n_parameters": 93325440} +{"train_lr": 0.0023997121959074114, "train_min_lr": 0.0023997121959074114, "train_loss": 0.6726540239671102, "train_loss_scale": 262144.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03898819892977675, "epoch": 42, "n_parameters": 93325440} +{"train_lr": 0.0023994393612525775, "train_min_lr": 0.0023994393612525775, "train_loss": 0.6718972477202232, "train_loss_scale": 262144.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03660729594934636, "epoch": 43, "n_parameters": 93325440} +{"train_lr": 0.002399075565415922, "train_min_lr": 0.002399075565415922, "train_loss": 0.6716325357556343, "train_loss_scale": 412540.71794871794, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03810084838038071, "epoch": 44, "n_parameters": 93325440} +{"train_lr": 0.0023986208361019097, "train_min_lr": 0.0023986208361019097, "train_loss": 0.6708741470311697, "train_loss_scale": 524288.0, "train_weight_decay": 0.050000000000000266, 
"train_grad_norm": 0.035760302478686355, "epoch": 45, "n_parameters": 93325440} +{"train_lr": 0.002398075207939935, "train_min_lr": 0.002398075207939935, "train_loss": 0.6701816931032599, "train_loss_scale": 524288.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.0361831144680484, "epoch": 46, "n_parameters": 93325440} +{"train_lr": 0.002397438722481704, "train_min_lr": 0.002397438722481704, "train_loss": 0.6697720118010273, "train_loss_scale": 524288.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03561555056904371, "epoch": 47, "n_parameters": 93325440} +{"train_lr": 0.002396711428198033, "train_min_lr": 0.002396711428198033, "train_loss": 0.6699321212438054, "train_loss_scale": 524288.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03857750490379448, "epoch": 48, "n_parameters": 93325440} +{"train_lr": 0.00239589338047521, "train_min_lr": 0.00239589338047521, "train_loss": 0.6692431434654654, "train_loss_scale": 524288.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03674515096757274, "epoch": 49, "n_parameters": 93325440} +{"train_lr": 0.0023949846416107326, "train_min_lr": 0.0023949846416107326, "train_loss": 0.6711435524078134, "train_loss_scale": 609988.9230769231, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.039313955614582084, "epoch": 50, "n_parameters": 93325440} +{"train_lr": 0.0023939852808085834, "train_min_lr": 0.0023939852808085834, "train_loss": 0.6686080549724209, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.0351801153857452, "epoch": 51, "n_parameters": 93325440} +{"train_lr": 0.0023928953741739565, "train_min_lr": 0.0023928953741739565, "train_loss": 0.6680617731733199, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03496479747506479, "epoch": 52, "n_parameters": 93325440} +{"train_lr": 0.002391715004707465, "train_min_lr": 0.002391715004707465, 
"train_loss": 0.6678077465591904, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.0346150501177479, "epoch": 53, "n_parameters": 93325440} +{"train_lr": 0.002390444262298807, "train_min_lr": 0.002390444262298807, "train_loss": 0.667374048788005, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03409873149118935, "epoch": 54, "n_parameters": 93325440} +{"train_lr": 0.002389083243719943, "train_min_lr": 0.002389083243719943, "train_loss": 0.6669572112986293, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.034755515513750605, "epoch": 55, "n_parameters": 93325440} +{"train_lr": 0.002387632052617705, "train_min_lr": 0.002387632052617705, "train_loss": 0.6681992361942927, "train_loss_scale": 809957.7435897436, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 56, "n_parameters": 93325440} +{"train_lr": 0.0023860907995059146, "train_min_lr": 0.0023860907995059146, "train_loss": 0.6667240882913271, "train_loss_scale": 524288.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03386468641483822, "epoch": 57, "n_parameters": 93325440} +{"train_lr": 0.002384459601756962, "train_min_lr": 0.002384459601756962, "train_loss": 0.6663572651405747, "train_loss_scale": 524288.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.033974862639577344, "epoch": 58, "n_parameters": 93325440} +{"train_lr": 0.0023827385835928716, "train_min_lr": 0.0023827385835928716, "train_loss": 0.6660926373054584, "train_loss_scale": 524288.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03487831858010628, "epoch": 59, "n_parameters": 93325440} +{"train_lr": 0.002380927876075842, "train_min_lr": 0.002380927876075842, "train_loss": 0.665804479462214, "train_loss_scale": 524288.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.0342934140159438, "epoch": 60, 
"n_parameters": 93325440} +{"train_lr": 0.0023790276170982585, "train_min_lr": 0.0023790276170982585, "train_loss": 0.6654091311188844, "train_loss_scale": 524288.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03340641431247768, "epoch": 61, "n_parameters": 93325440} +{"train_lr": 0.002377037951372201, "train_min_lr": 0.002377037951372201, "train_loss": 0.6653385141339058, "train_loss_scale": 547813.7435897436, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03390334472537805, "epoch": 62, "n_parameters": 93325440} +{"train_lr": 0.0023749590304184146, "train_min_lr": 0.0023749590304184146, "train_loss": 0.6650327132441677, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03470497362267894, "epoch": 63, "n_parameters": 93325440} +{"train_lr": 0.002372791012554783, "train_min_lr": 0.002372791012554783, "train_loss": 0.6656214212521147, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03538080747239292, "epoch": 64, "n_parameters": 93325440} +{"train_lr": 0.0023705340628842582, "train_min_lr": 0.0023705340628842582, "train_loss": 0.664771322089319, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03493891082083186, "epoch": 65, "n_parameters": 93325440} +{"train_lr": 0.002368188353282295, "train_min_lr": 0.002368188353282295, "train_loss": 0.6646730641428477, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03361503693919916, "epoch": 66, "n_parameters": 93325440} +{"train_lr": 0.0023657540623837642, "train_min_lr": 0.0023657540623837642, "train_loss": 0.6643176994119318, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.033847464117197655, "epoch": 67, "n_parameters": 93325440} +{"train_lr": 0.00236323137556934, "train_min_lr": 0.00236323137556934, "train_loss": 0.6641596989133037, 
"train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03371662331912189, "epoch": 68, "n_parameters": 93325440} +{"train_lr": 0.0023606204849513923, "train_min_lr": 0.0023606204849513923, "train_loss": 0.6640265851926345, "train_loss_scale": 1714018.4615384615, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03474894096740545, "epoch": 69, "n_parameters": 93325440} +{"train_lr": 0.002357921589359349, "train_min_lr": 0.002357921589359349, "train_loss": 0.6637311494933107, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.033441266182284705, "epoch": 70, "n_parameters": 93325440} +{"train_lr": 0.002355134894324556, "train_min_lr": 0.002355134894324556, "train_loss": 0.6635217848353279, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03384943720765221, "epoch": 71, "n_parameters": 93325440} +{"train_lr": 0.0023522606120646365, "train_min_lr": 0.0023522606120646365, "train_loss": 0.6633951249890603, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.033622669599329434, "epoch": 72, "n_parameters": 93325440} +{"train_lr": 0.002349298961467303, "train_min_lr": 0.002349298961467303, "train_loss": 0.6633510416707932, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03442244214984851, "epoch": 73, "n_parameters": 93325440} +{"train_lr": 0.0023462501680737214, "train_min_lr": 0.0023462501680737214, "train_loss": 0.663141653347665, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.033474423409176945, "epoch": 74, "n_parameters": 93325440} +{"train_lr": 0.0023431144640613144, "train_min_lr": 0.0023431144640613144, "train_loss": 0.6629860976185554, "train_loss_scale": 2567666.871794872, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03471351237012408, "epoch": 75, 
"n_parameters": 93325440} +{"train_lr": 0.0023398920882260776, "train_min_lr": 0.0023398920882260776, "train_loss": 0.6628032483351536, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03359772727633707, "epoch": 76, "n_parameters": 93325440} +{"train_lr": 0.002336583285964409, "train_min_lr": 0.002336583285964409, "train_loss": 0.6635485236079265, "train_loss_scale": 4080036.1025641025, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 77, "n_parameters": 93325440} +{"train_lr": 0.0023331883092544115, "train_min_lr": 0.0023331883092544115, "train_loss": 0.6628299321597203, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.033620487104575984, "epoch": 78, "n_parameters": 93325440} +{"train_lr": 0.0023297074166367046, "train_min_lr": 0.0023297074166367046, "train_loss": 0.6625400330537022, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.033616363954467654, "epoch": 79, "n_parameters": 93325440} +{"train_lr": 0.0023261408731947413, "train_min_lr": 0.0023261408731947413, "train_loss": 0.6622193893656517, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03423108916896849, "epoch": 80, "n_parameters": 93325440} +{"train_lr": 0.002322488950534608, "train_min_lr": 0.002322488950534608, "train_loss": 0.6621070915164474, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.033666627577100046, "epoch": 81, "n_parameters": 93325440} +{"train_lr": 0.0023187519267643623, "train_min_lr": 0.0023187519267643623, "train_loss": 0.6622220281845866, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03502245578102958, "epoch": 82, "n_parameters": 93325440} +{"train_lr": 0.0023149300864728226, "train_min_lr": 0.0023149300864728226, "train_loss": 0.6620114981316221, 
"train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03415211925330835, "epoch": 83, "n_parameters": 93325440} +{"train_lr": 0.00231102372070793, "train_min_lr": 0.00231102372070793, "train_loss": 0.6617905595459235, "train_loss_scale": 3448201.846153846, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03398606629492954, "epoch": 84, "n_parameters": 93325440} +{"train_lr": 0.002307033126954561, "train_min_lr": 0.002307033126954561, "train_loss": 0.6629372512062008, "train_loss_scale": 1459436.3076923077, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 85, "n_parameters": 93325440} +{"train_lr": 0.002302958609111882, "train_min_lr": 0.002302958609111882, "train_loss": 0.661601463977534, "train_loss_scale": 262144.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03439310951850926, "epoch": 86, "n_parameters": 93325440} +{"train_lr": 0.002298800477470194, "train_min_lr": 0.002298800477470194, "train_loss": 0.6614590679319241, "train_loss_scale": 262144.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03440534176591497, "epoch": 87, "n_parameters": 93325440} +{"train_lr": 0.0022945590486873305, "train_min_lr": 0.0022945590486873305, "train_loss": 0.6613834299242649, "train_loss_scale": 262144.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.034254728303028226, "epoch": 88, "n_parameters": 93325440} +{"train_lr": 0.0022902346457645086, "train_min_lr": 0.0022902346457645086, "train_loss": 0.6611541778279039, "train_loss_scale": 262144.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03509240785542016, "epoch": 89, "n_parameters": 93325440} +{"train_lr": 0.002285827598021753, "train_min_lr": 0.002285827598021753, "train_loss": 0.6612528438608234, "train_loss_scale": 262144.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03502046571781811, "epoch": 90, "n_parameters": 93325440} 
+{"train_lr": 0.0022813382410728175, "train_min_lr": 0.0022813382410728175, "train_loss": 0.6609599692269396, "train_loss_scale": 333561.4358974359, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03530590945424942, "epoch": 91, "n_parameters": 93325440} +{"train_lr": 0.0022767669167996093, "train_min_lr": 0.0022767669167996093, "train_loss": 0.6609104336597599, "train_loss_scale": 524288.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.034537219042436056, "epoch": 92, "n_parameters": 93325440} +{"train_lr": 0.002272113973326174, "train_min_lr": 0.002272113973326174, "train_loss": 0.6607935074955608, "train_loss_scale": 524288.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.035604342979450636, "epoch": 93, "n_parameters": 93325440} +{"train_lr": 0.00226737976499217, "train_min_lr": 0.00226737976499217, "train_loss": 0.6625717931355422, "train_loss_scale": 524288.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.0398545018158471, "epoch": 94, "n_parameters": 93325440} +{"train_lr": 0.0022625646523258907, "train_min_lr": 0.0022625646523258907, "train_loss": 0.6607921089594945, "train_loss_scale": 524288.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.034550254352581806, "epoch": 95, "n_parameters": 93325440} +{"train_lr": 0.002257669002016808, "train_min_lr": 0.002257669002016808, "train_loss": 0.6605105223372961, "train_loss_scale": 524288.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.034636476095049426, "epoch": 96, "n_parameters": 93325440} +{"train_lr": 0.0022526931868876465, "train_min_lr": 0.0022526931868876465, "train_loss": 0.6604859339407622, "train_loss_scale": 524288.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.035125622901922234, "epoch": 97, "n_parameters": 93325440} +{"train_lr": 0.0022476375858659957, "train_min_lr": 0.0022476375858659957, "train_loss": 0.6602891938378795, "train_loss_scale": 976318.358974359, 
"train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03531917932634361, "epoch": 98, "n_parameters": 93325440} +{"train_lr": 0.002242502583955447, "train_min_lr": 0.002242502583955447, "train_loss": 0.6603036113083363, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03573522149012066, "epoch": 99, "n_parameters": 93325440} +{"train_lr": 0.0022372885722062746, "train_min_lr": 0.0022372885722062746, "train_loss": 0.6603299641790681, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03621007654314431, "epoch": 100, "n_parameters": 93325440} +{"train_lr": 0.00223199594768566, "train_min_lr": 0.00223199594768566, "train_loss": 0.6600742791659939, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03562110195008035, "epoch": 101, "n_parameters": 93325440} +{"train_lr": 0.002226625113447457, "train_min_lr": 0.002226625113447457, "train_loss": 0.659991750254845, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03583730082027614, "epoch": 102, "n_parameters": 93325440} +{"train_lr": 0.0022211764785014763, "train_min_lr": 0.0022211764785014763, "train_loss": 0.6598965032503773, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.035701336876417585, "epoch": 103, "n_parameters": 93325440} +{"train_lr": 0.002215650457782375, "train_min_lr": 0.002215650457782375, "train_loss": 0.6597276878996919, "train_loss_scale": 1522451.6923076923, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03592537800208307, "epoch": 104, "n_parameters": 93325440} +{"train_lr": 0.0022100474721180197, "train_min_lr": 0.0022100474721180197, "train_loss": 0.6596770419094425, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.0358598633036495, "epoch": 105, "n_parameters": 93325440} +{"train_lr": 
0.0022043679481974616, "train_min_lr": 0.0022043679481974616, "train_loss": 0.659525745572188, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.035852226828678675, "epoch": 106, "n_parameters": 93325440} +{"train_lr": 0.0021986123185384417, "train_min_lr": 0.0021986123185384417, "train_loss": 0.6594595561902492, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03639518651060569, "epoch": 107, "n_parameters": 93325440} +{"train_lr": 0.002192781021454436, "train_min_lr": 0.002192781021454436, "train_loss": 0.6592210801079487, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.035937764580385424, "epoch": 108, "n_parameters": 93325440} +{"train_lr": 0.0021868745010212983, "train_min_lr": 0.0021868745010212983, "train_loss": 0.6593411502261193, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03644818510525884, "epoch": 109, "n_parameters": 93325440} +{"train_lr": 0.0021808932070434225, "train_min_lr": 0.0021808932070434225, "train_loss": 0.659102826904601, "train_loss_scale": 2184533.3333333335, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03611473233486789, "epoch": 110, "n_parameters": 93325440} +{"train_lr": 0.002174837595019509, "train_min_lr": 0.002174837595019509, "train_loss": 0.6592099234843866, "train_loss_scale": 3112119.794871795, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 111, "n_parameters": 93325440} +{"train_lr": 0.0021687081261078578, "train_min_lr": 0.0021687081261078578, "train_loss": 0.6590672825486996, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03640902054328949, "epoch": 112, "n_parameters": 93325440} +{"train_lr": 0.0021625052670912522, "train_min_lr": 0.0021625052670912522, "train_loss": 0.6590020086807318, "train_loss_scale": 2097152.0, 
"train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.036369339479372285, "epoch": 113, "n_parameters": 93325440} +{"train_lr": 0.0021562294903414267, "train_min_lr": 0.0021562294903414267, "train_loss": 0.6589464796945835, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03698857306526639, "epoch": 114, "n_parameters": 93325440} +{"train_lr": 0.0021498812737830776, "train_min_lr": 0.0021498812737830776, "train_loss": 0.6587196541233704, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03618161466856224, "epoch": 115, "n_parameters": 93325440} +{"train_lr": 0.0021434611008574723, "train_min_lr": 0.0021434611008574723, "train_loss": 0.6586414177257282, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03689980891963037, "epoch": 116, "n_parameters": 93325440} +{"train_lr": 0.002136969460485639, "train_min_lr": 0.002136969460485639, "train_loss": 0.658656991349581, "train_loss_scale": 2318966.153846154, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03741015896248894, "epoch": 117, "n_parameters": 93325440} +{"train_lr": 0.002130406847031118, "train_min_lr": 0.002130406847031118, "train_loss": 0.6585542277600138, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03709424373049002, "epoch": 118, "n_parameters": 93325440} +{"train_lr": 0.002123773760262341, "train_min_lr": 0.002123773760262341, "train_loss": 0.6583075237054473, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03686075314927178, "epoch": 119, "n_parameters": 93325440} +{"train_lr": 0.002117070705314543, "train_min_lr": 0.002117070705314543, "train_loss": 0.6585102182072706, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.038124814844475344, "epoch": 120, "n_parameters": 93325440} +{"train_lr": 
0.0021102981926513073, "train_min_lr": 0.0021102981926513073, "train_loss": 0.6583685486887892, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03707056720621693, "epoch": 121, "n_parameters": 93325440} +{"train_lr": 0.0021034567380257023, "train_min_lr": 0.0021034567380257023, "train_loss": 0.658206457654253, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.036788777669127554, "epoch": 122, "n_parameters": 93325440} +{"train_lr": 0.0020965468624409753, "train_min_lr": 0.0020965468624409753, "train_loss": 0.6580094202923087, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.037147117468218006, "epoch": 123, "n_parameters": 93325440} +{"train_lr": 0.002089569092110911, "train_min_lr": 0.002089569092110911, "train_loss": 0.6592672796537861, "train_loss_scale": 3428036.923076923, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 124, "n_parameters": 93325440} +{"train_lr": 0.0020825239584197327, "train_min_lr": 0.0020825239584197327, "train_loss": 0.658130434437249, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03721978203154718, "epoch": 125, "n_parameters": 93325440} +{"train_lr": 0.0020754119978816502, "train_min_lr": 0.0020754119978816502, "train_loss": 0.6580190308009967, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.037171452276360914, "epoch": 126, "n_parameters": 93325440} +{"train_lr": 0.0020682337520999913, "train_min_lr": 0.0020682337520999913, "train_loss": 0.657770600862419, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03729113473747976, "epoch": 127, "n_parameters": 93325440} +{"train_lr": 0.0020609897677259627, "train_min_lr": 0.0020609897677259627, "train_loss": 0.6576646924591981, "train_loss_scale": 2097152.0, "train_weight_decay": 
0.050000000000000266, "train_grad_norm": 0.03765304574074271, "epoch": 128, "n_parameters": 93325440} +{"train_lr": 0.002053680596417025, "train_min_lr": 0.002053680596417025, "train_loss": 0.6576654529199004, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03781039642098431, "epoch": 129, "n_parameters": 93325440} +{"train_lr": 0.00204630679479487, "train_min_lr": 0.00204630679479487, "train_loss": 0.6575054912469708, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03823968046941818, "epoch": 130, "n_parameters": 93325440} +{"train_lr": 0.002038868924403038, "train_min_lr": 0.002038868924403038, "train_loss": 0.6574385501921941, "train_loss_scale": 2258471.3846153845, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 131, "n_parameters": 93325440} +{"train_lr": 0.0020313675516641576, "train_min_lr": 0.0020313675516641576, "train_loss": 0.6573616523200121, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03814637377404441, "epoch": 132, "n_parameters": 93325440} +{"train_lr": 0.0020238032478368064, "train_min_lr": 0.0020238032478368064, "train_loss": 0.6572599850642757, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03755328234117956, "epoch": 133, "n_parameters": 93325440} +{"train_lr": 0.002016176588972008, "train_min_lr": 0.002016176588972008, "train_loss": 0.657276330444102, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03865379040750364, "epoch": 134, "n_parameters": 93325440} +{"train_lr": 0.002008488155869361, "train_min_lr": 0.002008488155869361, "train_loss": 0.6572356811270882, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.038243835081513494, "epoch": 135, "n_parameters": 93325440} +{"train_lr": 0.002000738534032814, "train_min_lr": 
0.002000738534032814, "train_loss": 0.6577621484414126, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04061774171602268, "epoch": 136, "n_parameters": 93325440} +{"train_lr": 0.0019929283136260727, "train_min_lr": 0.0019929283136260727, "train_loss": 0.6570336658698626, "train_loss_scale": 3145728.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.0380012393833544, "epoch": 137, "n_parameters": 93325440} +{"train_lr": 0.001985058089427659, "train_min_lr": 0.001985058089427659, "train_loss": 0.6569213079622923, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.037724959305845775, "epoch": 138, "n_parameters": 93325440} +{"train_lr": 0.0019771284607856218, "train_min_lr": 0.0019771284607856218, "train_loss": 0.6569687100366141, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03889632499060379, "epoch": 139, "n_parameters": 93325440} +{"train_lr": 0.0019691400315718726, "train_min_lr": 0.0019691400315718726, "train_loss": 0.6568288122041103, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03823023961665921, "epoch": 140, "n_parameters": 93325440} +{"train_lr": 0.001961093410136237, "train_min_lr": 0.001961093410136237, "train_loss": 0.6566447338137107, "train_loss_scale": 2285357.9487179485, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 141, "n_parameters": 93325440} +{"train_lr": 0.0019529892092600813, "train_min_lr": 0.0019529892092600813, "train_loss": 0.6577811507412639, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.0412411448927835, "epoch": 142, "n_parameters": 93325440} +{"train_lr": 0.0019448280461096836, "train_min_lr": 0.0019448280461096836, "train_loss": 0.6566688927750175, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 
0.03831841904096878, "epoch": 143, "n_parameters": 93325440} +{"train_lr": 0.0019366105421892137, "train_min_lr": 0.0019366105421892137, "train_loss": 0.6564733442874291, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03837170583219864, "epoch": 144, "n_parameters": 93325440} +{"train_lr": 0.0019283373232934099, "train_min_lr": 0.0019283373232934099, "train_loss": 0.6564794568201671, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03877275445474646, "epoch": 145, "n_parameters": 93325440} +{"train_lr": 0.0019200090194599236, "train_min_lr": 0.0019200090194599236, "train_loss": 0.6564271587591904, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04010523812702069, "epoch": 146, "n_parameters": 93325440} +{"train_lr": 0.0019116262649213377, "train_min_lr": 0.0019116262649213377, "train_loss": 0.656340552183489, "train_loss_scale": 3145728.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.0387989131327814, "epoch": 147, "n_parameters": 93325440} +{"train_lr": 0.0019031896980568602, "train_min_lr": 0.0019031896980568602, "train_loss": 0.6562293704169301, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.039052358601624385, "epoch": 148, "n_parameters": 93325440} +{"train_lr": 0.001894699961343726, "train_min_lr": 0.001894699961343726, "train_loss": 0.6562115392910365, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.039240701452422984, "epoch": 149, "n_parameters": 93325440} +{"train_lr": 0.0018861577013082516, "train_min_lr": 0.0018861577013082516, "train_loss": 0.6560243863421373, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03884188269073956, "epoch": 150, "n_parameters": 93325440} +{"train_lr": 0.0018775635684766133, "train_min_lr": 0.0018775635684766133, 
"train_loss": 0.6559707283830414, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.039168536728725604, "epoch": 151, "n_parameters": 93325440} +{"train_lr": 0.0018689182173253027, "train_min_lr": 0.0018689182173253027, "train_loss": 0.6558644921303942, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03925456973509146, "epoch": 152, "n_parameters": 93325440} +{"train_lr": 0.0018602223062312783, "train_min_lr": 0.0018602223062312783, "train_loss": 0.6558279459340832, "train_loss_scale": 4207747.282051282, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 153, "n_parameters": 93325440} +{"train_lr": 0.0018514764974218371, "train_min_lr": 0.0018514764974218371, "train_loss": 0.6558603528313912, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.039963523779685296, "epoch": 154, "n_parameters": 93325440} +{"train_lr": 0.0018426814569241794, "train_min_lr": 0.0018426814569241794, "train_loss": 0.6557053431916313, "train_loss_scale": 3313769.0256410255, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 155, "n_parameters": 93325440} +{"train_lr": 0.0018338378545146971, "train_min_lr": 0.0018338378545146971, "train_loss": 0.6557001805362793, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04054142555030875, "epoch": 156, "n_parameters": 93325440} +{"train_lr": 0.0018249463636679463, "train_min_lr": 0.0018249463636679463, "train_loss": 0.6555671931411593, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.039896765908895016, "epoch": 157, "n_parameters": 93325440} +{"train_lr": 0.0018160076615053812, "train_min_lr": 0.0018160076615053812, "train_loss": 0.6556670764843241, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 
0.040883598544706516, "epoch": 158, "n_parameters": 93325440} +{"train_lr": 0.0018070224287437813, "train_min_lr": 0.0018070224287437813, "train_loss": 0.6553049489664726, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.0398384396177836, "epoch": 159, "n_parameters": 93325440} +{"train_lr": 0.0017979913496434085, "train_min_lr": 0.0017979913496434085, "train_loss": 0.6553408457205082, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04008960960886608, "epoch": 160, "n_parameters": 93325440} +{"train_lr": 0.0017889151119559006, "train_min_lr": 0.0017889151119559006, "train_loss": 0.6552601037785794, "train_loss_scale": 2117316.923076923, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.03976045259967064, "epoch": 161, "n_parameters": 93325440} +{"train_lr": 0.0017797944068718974, "train_min_lr": 0.0017797944068718974, "train_loss": 0.6551581882895567, "train_loss_scale": 3340655.5897435895, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 162, "n_parameters": 93325440} +{"train_lr": 0.0017706299289684047, "train_min_lr": 0.0017706299289684047, "train_loss": 0.6550808818772053, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04039769527168037, "epoch": 163, "n_parameters": 93325440} +{"train_lr": 0.0017614223761558967, "train_min_lr": 0.0017614223761558967, "train_loss": 0.654921959584149, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.039989605074366316, "epoch": 164, "n_parameters": 93325440} +{"train_lr": 0.001752172449625165, "train_min_lr": 0.001752172449625165, "train_loss": 0.6550507604693755, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.041154535164913304, "epoch": 165, "n_parameters": 93325440} +{"train_lr": 0.0017428808537939323, "train_min_lr": 0.0017428808537939323, 
"train_loss": 0.6547575006022667, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04056112305261195, "epoch": 166, "n_parameters": 93325440} +{"train_lr": 0.0017335482962531922, "train_min_lr": 0.0017335482962531922, "train_loss": 0.6546947882057, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04049342578181472, "epoch": 167, "n_parameters": 93325440} +{"train_lr": 0.0017241754877133318, "train_min_lr": 0.0017241754877133318, "train_loss": 0.6546552269838941, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04057711954467381, "epoch": 168, "n_parameters": 93325440} +{"train_lr": 0.0017147631419500143, "train_min_lr": 0.0017147631419500143, "train_loss": 0.6545772810156146, "train_loss_scale": 4187582.358974359, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04082152419962371, "epoch": 169, "n_parameters": 93325440} +{"train_lr": 0.0017053119757498118, "train_min_lr": 0.0017053119757498118, "train_loss": 0.6545756479534202, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.042030869087634176, "epoch": 170, "n_parameters": 93325440} +{"train_lr": 0.001695822708855617, "train_min_lr": 0.001695822708855617, "train_loss": 0.6545097664332925, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04112377291760193, "epoch": 171, "n_parameters": 93325440} +{"train_lr": 0.001686296063911845, "train_min_lr": 0.001686296063911845, "train_loss": 0.6543824052772461, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.040817254019948915, "epoch": 172, "n_parameters": 93325440} +{"train_lr": 0.0016767327664093945, "train_min_lr": 0.0016767327664093945, "train_loss": 0.6542876160536439, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 
0.04131795748327978, "epoch": 173, "n_parameters": 93325440} +{"train_lr": 0.0016671335446303921, "train_min_lr": 0.0016671335446303921, "train_loss": 0.6544181934725015, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04192646426291993, "epoch": 174, "n_parameters": 93325440} +{"train_lr": 0.0016574991295927436, "train_min_lr": 0.0016574991295927436, "train_loss": 0.6541326292432271, "train_loss_scale": 4207747.282051282, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 175, "n_parameters": 93325440} +{"train_lr": 0.001647830254994458, "train_min_lr": 0.001647830254994458, "train_loss": 0.654219655558849, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04151277960493015, "epoch": 176, "n_parameters": 93325440} +{"train_lr": 0.0016381276571577643, "train_min_lr": 0.0016381276571577643, "train_loss": 0.6540735674639925, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04252699216326269, "epoch": 177, "n_parameters": 93325440} +{"train_lr": 0.0016283920749730564, "train_min_lr": 0.0016283920749730564, "train_loss": 0.6539614871383096, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.042018215010802336, "epoch": 178, "n_parameters": 93325440} +{"train_lr": 0.0016186242498426112, "train_min_lr": 0.0016186242498426112, "train_loss": 0.6538805883043469, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04170390099692994, "epoch": 179, "n_parameters": 93325440} +{"train_lr": 0.0016088249256241284, "train_min_lr": 0.0016088249256241284, "train_loss": 0.6538529533128707, "train_loss_scale": 2144203.487179487, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 180, "n_parameters": 93325440} +{"train_lr": 0.0015989948485740878, "train_min_lr": 0.0015989948485740878, "train_loss": 
0.6536841267146744, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04197185315812627, "epoch": 181, "n_parameters": 93325440} +{"train_lr": 0.0015891347672909151, "train_min_lr": 0.0015891347672909151, "train_loss": 0.6536750256394347, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.042221493111589015, "epoch": 182, "n_parameters": 93325440} +{"train_lr": 0.001579245432657976, "train_min_lr": 0.001579245432657976, "train_loss": 0.6545264580979561, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04499619110869483, "epoch": 183, "n_parameters": 93325440} +{"train_lr": 0.0015693275977863898, "train_min_lr": 0.0015693275977863898, "train_loss": 0.6535338635007159, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04249066217109943, "epoch": 184, "n_parameters": 93325440} +{"train_lr": 0.00155938201795768, "train_min_lr": 0.00155938201795768, "train_loss": 0.6535800552855318, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.043255524387439855, "epoch": 185, "n_parameters": 93325440} +{"train_lr": 0.0015494094505662558, "train_min_lr": 0.0015494094505662558, "train_loss": 0.6533823190256953, "train_loss_scale": 2271914.6666666665, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 186, "n_parameters": 93325440} +{"train_lr": 0.001539410655061736, "train_min_lr": 0.001539410655061736, "train_loss": 0.6532779107921016, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.0433023767068218, "epoch": 187, "n_parameters": 93325440} +{"train_lr": 0.0015293863928911096, "train_min_lr": 0.0015293863928911096, "train_loss": 0.6531557398251234, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04270600716774471, "epoch": 188, 
"n_parameters": 93325440} +{"train_lr": 0.001519337427440752, "train_min_lr": 0.001519337427440752, "train_loss": 0.6530751259042284, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04218650396125248, "epoch": 189, "n_parameters": 93325440} +{"train_lr": 0.00150926452397829, "train_min_lr": 0.00150926452397829, "train_loss": 0.6529797476310378, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04319860042932515, "epoch": 190, "n_parameters": 93325440} +{"train_lr": 0.0014991684495943168, "train_min_lr": 0.0014991684495943168, "train_loss": 0.6529432415054777, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04291143520281483, "epoch": 191, "n_parameters": 93325440} +{"train_lr": 0.0014890499731439859, "train_min_lr": 0.0014890499731439859, "train_loss": 0.652965060244195, "train_loss_scale": 2251749.7435897435, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04314781410786777, "epoch": 192, "n_parameters": 93325440} +{"train_lr": 0.001478909865188459, "train_min_lr": 0.001478909865188459, "train_loss": 0.6527594692575244, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04327949065452394, "epoch": 193, "n_parameters": 93325440} +{"train_lr": 0.0014687488979362113, "train_min_lr": 0.0014687488979362113, "train_loss": 0.6527444154071884, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04356447886675596, "epoch": 194, "n_parameters": 93325440} +{"train_lr": 0.001458567845184241, "train_min_lr": 0.001458567845184241, "train_loss": 0.6526207220621216, "train_loss_scale": 2318966.153846154, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 195, "n_parameters": 93325440} +{"train_lr": 0.001448367482259133, "train_min_lr": 0.001448367482259133, "train_loss": 0.6524855274563799, 
"train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04391956604563464, "epoch": 196, "n_parameters": 93325440} +{"train_lr": 0.001438148585958014, "train_min_lr": 0.001438148585958014, "train_loss": 0.6524408043911442, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.0437394251796202, "epoch": 197, "n_parameters": 93325440} +{"train_lr": 0.0014279119344894028, "train_min_lr": 0.0014279119344894028, "train_loss": 0.6523803126496764, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04416952813521792, "epoch": 198, "n_parameters": 93325440} +{"train_lr": 0.0014176583074139429, "train_min_lr": 0.0014176583074139429, "train_loss": 0.6521773194559873, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04388629415263541, "epoch": 199, "n_parameters": 93325440} +{"train_lr": 0.0014073884855850315, "train_min_lr": 0.0014073884855850315, "train_loss": 0.6522814940231351, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04478665374410458, "epoch": 200, "n_parameters": 93325440} +{"train_lr": 0.0013971032510893652, "train_min_lr": 0.0013971032510893652, "train_loss": 0.6521291283604044, "train_loss_scale": 3112119.794871795, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.044348080087309845, "epoch": 201, "n_parameters": 93325440} +{"train_lr": 0.0013868033871873699, "train_min_lr": 0.0013868033871873699, "train_loss": 0.6527649165155032, "train_loss_scale": 3454923.487179487, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 202, "n_parameters": 93325440} +{"train_lr": 0.0013764896782535606, "train_min_lr": 0.0013764896782535606, "train_loss": 0.6520171289642652, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.044557038258808926, "epoch": 203, 
"n_parameters": 93325440} +{"train_lr": 0.0013661629097168019, "train_min_lr": 0.0013661629097168019, "train_loss": 0.6519864696340684, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04660083410831598, "epoch": 204, "n_parameters": 93325440} +{"train_lr": 0.0013558238680005015, "train_min_lr": 0.0013558238680005015, "train_loss": 0.651847201733826, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04455802842186621, "epoch": 205, "n_parameters": 93325440} +{"train_lr": 0.0013454733404627138, "train_min_lr": 0.0013454733404627138, "train_loss": 0.6517544956400226, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04539844350149043, "epoch": 206, "n_parameters": 93325440} +{"train_lr": 0.0013351121153361868, "train_min_lr": 0.0013351121153361868, "train_loss": 0.6516532257008247, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.045580623366941624, "epoch": 207, "n_parameters": 93325440} +{"train_lr": 0.0013247409816683246, "train_min_lr": 0.0013247409816683246, "train_loss": 0.6515663202183369, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.0457235167388064, "epoch": 208, "n_parameters": 93325440} +{"train_lr": 0.001314360729261115, "train_min_lr": 0.001314360729261115, "train_loss": 0.6514970767908753, "train_loss_scale": 4073314.4615384615, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04559553323838955, "epoch": 209, "n_parameters": 93325440} +{"train_lr": 0.0013039721486109636, "train_min_lr": 0.0013039721486109636, "train_loss": 0.6521160529018977, "train_loss_scale": 1626637.1282051282, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 210, "n_parameters": 93325440} +{"train_lr": 0.0012935760308485087, "train_min_lr": 0.0012935760308485087, "train_loss": 0.6513881812301966, 
"train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.045461809143232994, "epoch": 211, "n_parameters": 93325440} +{"train_lr": 0.0012831731676783689, "train_min_lr": 0.0012831731676783689, "train_loss": 0.6512881333772571, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04622284271037923, "epoch": 212, "n_parameters": 93325440} +{"train_lr": 0.001272764351318853, "train_min_lr": 0.001272764351318853, "train_loss": 0.6511788309241334, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04637779676331542, "epoch": 213, "n_parameters": 93325440} +{"train_lr": 0.0012623503744416213, "train_min_lr": 0.0012623503744416213, "train_loss": 0.6510865527372329, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04615227956300936, "epoch": 214, "n_parameters": 93325440} +{"train_lr": 0.0012519320301113358, "train_min_lr": 0.0012519320301113358, "train_loss": 0.6510655480699662, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.047730134501575656, "epoch": 215, "n_parameters": 93325440} +{"train_lr": 0.0012415101117252529, "train_min_lr": 0.0012415101117252529, "train_loss": 0.6508944522684965, "train_loss_scale": 1451874.4615384615, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.046214736675700314, "epoch": 216, "n_parameters": 93325440} +{"train_lr": 0.0012310854129528052, "train_min_lr": 0.0012310854129528052, "train_loss": 0.6508428633977206, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04702541534191905, "epoch": 217, "n_parameters": 93325440} +{"train_lr": 0.0012206587276751709, "train_min_lr": 0.0012206587276751709, "train_loss": 0.6506849839232671, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04694693328406757, "epoch": 218, 
"n_parameters": 93325440} +{"train_lr": 0.0012102308499247975, "train_min_lr": 0.0012102308499247975, "train_loss": 0.6506286191586883, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04696661863141717, "epoch": 219, "n_parameters": 93325440} +{"train_lr": 0.0011998025738249494, "train_min_lr": 0.0011998025738249494, "train_loss": 0.6505410681502559, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04692891295641088, "epoch": 220, "n_parameters": 93325440} +{"train_lr": 0.0011893746935292267, "train_min_lr": 0.0011893746935292267, "train_loss": 0.6505277044354723, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04882969319796524, "epoch": 221, "n_parameters": 93325440} +{"train_lr": 0.0011789480031610881, "train_min_lr": 0.0011789480031610881, "train_loss": 0.6503539036672848, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04787985252168698, "epoch": 222, "n_parameters": 93325440} +{"train_lr": 0.001168523296753375, "train_min_lr": 0.001168523296753375, "train_loss": 0.6502815783262635, "train_loss_scale": 4140530.871794872, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04777871564818689, "epoch": 223, "n_parameters": 93325440} +{"train_lr": 0.0011581013681878376, "train_min_lr": 0.0011581013681878376, "train_loss": 0.6502043705624647, "train_loss_scale": 3179336.205128205, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 224, "n_parameters": 93325440} +{"train_lr": 0.0011476830111346887, "train_min_lr": 0.0011476830111346887, "train_loss": 0.6501268409670163, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04823222842951042, "epoch": 225, "n_parameters": 93325440} +{"train_lr": 0.001137269018992153, "train_min_lr": 0.001137269018992153, "train_loss": 0.6499739350894322, 
"train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.0483317183306775, "epoch": 226, "n_parameters": 93325440} +{"train_lr": 0.0011268601848260537, "train_min_lr": 0.0011268601848260537, "train_loss": 0.6509864589390464, "train_loss_scale": 1451874.4615384615, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 227, "n_parameters": 93325440} +{"train_lr": 0.0011164573013094073, "train_min_lr": 0.0011164573013094073, "train_loss": 0.6499788061930583, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04817305395427423, "epoch": 228, "n_parameters": 93325440} +{"train_lr": 0.001106061160662077, "train_min_lr": 0.001106061160662077, "train_loss": 0.6497716619274937, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04839964024722576, "epoch": 229, "n_parameters": 93325440} +{"train_lr": 0.0010956725545904166, "train_min_lr": 0.0010956725545904166, "train_loss": 0.6496301181136798, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04950273199341236, "epoch": 230, "n_parameters": 93325440} +{"train_lr": 0.0010852922742270053, "train_min_lr": 0.0010852922742270053, "train_loss": 0.6495860489801719, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.049837018661678605, "epoch": 231, "n_parameters": 93325440} +{"train_lr": 0.0010749211100703794, "train_min_lr": 0.0010749211100703794, "train_loss": 0.6494765863395654, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.049607541806136184, "epoch": 232, "n_parameters": 93325440} +{"train_lr": 0.0010645598519248383, "train_min_lr": 0.0010645598519248383, "train_loss": 0.6494915124076681, "train_loss_scale": 1263668.5128205128, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04930285058724575, "epoch": 233, 
"n_parameters": 93325440} +{"train_lr": 0.0010542092888403117, "train_min_lr": 0.0010542092888403117, "train_loss": 0.6492499822798448, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.05036587239458011, "epoch": 234, "n_parameters": 93325440} +{"train_lr": 0.0010438702090522496, "train_min_lr": 0.0010438702090522496, "train_loss": 0.6491830785018511, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.04942306615889837, "epoch": 235, "n_parameters": 93325440} +{"train_lr": 0.001033543399921608, "train_min_lr": 0.001033543399921608, "train_loss": 0.6491319514476718, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.05040391029503483, "epoch": 236, "n_parameters": 93325440} +{"train_lr": 0.001023229647874884, "train_min_lr": 0.001023229647874884, "train_loss": 0.6491064585697575, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.05058605785672672, "epoch": 237, "n_parameters": 93325440} +{"train_lr": 0.0010129297383442272, "train_min_lr": 0.0010129297383442272, "train_loss": 0.6488493490868654, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.05047347367359086, "epoch": 238, "n_parameters": 93325440} +{"train_lr": 0.0010026444557076238, "train_min_lr": 0.0010026444557076238, "train_loss": 0.6489043886988208, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.05205608325270124, "epoch": 239, "n_parameters": 93325440} +{"train_lr": 0.000992374583229171, "train_min_lr": 0.000992374583229171, "train_loss": 0.6487640984451923, "train_loss_scale": 3764118.9743589745, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.0509271570123159, "epoch": 240, "n_parameters": 93325440} +{"train_lr": 0.0009821209029994167, "train_min_lr": 0.0009821209029994167, "train_loss": 0.6486013716516587, 
"train_loss_scale": 2520615.3846153845, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 241, "n_parameters": 93325440} +{"train_lr": 0.0009718841958758109, "train_min_lr": 0.0009718841958758109, "train_loss": 0.6486407348360771, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.05130887303787928, "epoch": 242, "n_parameters": 93325440} +{"train_lr": 0.0009616652414232358, "train_min_lr": 0.0009616652414232358, "train_loss": 0.6486426391758215, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.05432937994527702, "epoch": 243, "n_parameters": 93325440} +{"train_lr": 0.0009514648178546331, "train_min_lr": 0.0009514648178546331, "train_loss": 0.648351379407522, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.051419835310811415, "epoch": 244, "n_parameters": 93325440} +{"train_lr": 0.0009412837019717529, "train_min_lr": 0.0009412837019717529, "train_loss": 0.6482436602982955, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.05215922709649954, "epoch": 245, "n_parameters": 93325440} +{"train_lr": 0.0009311226691059865, "train_min_lr": 0.0009311226691059865, "train_loss": 0.6480796076834966, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.05242994439621002, "epoch": 246, "n_parameters": 93325440} +{"train_lr": 0.0009209824930593261, "train_min_lr": 0.0009209824930593261, "train_loss": 0.6481469419952004, "train_loss_scale": 2910470.564102564, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.053209778387099504, "epoch": 247, "n_parameters": 93325440} +{"train_lr": 0.0009108639460454382, "train_min_lr": 0.0009108639460454382, "train_loss": 0.6480937603956614, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.05251093531170717, "epoch": 248, 
"n_parameters": 93325440} +{"train_lr": 0.0009007677986308538, "train_min_lr": 0.0009007677986308538, "train_loss": 0.6479733674667585, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.05371020100294398, "epoch": 249, "n_parameters": 93325440} +{"train_lr": 0.0008906948196762859, "train_min_lr": 0.0008906948196762859, "train_loss": 0.6478110108858882, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.05340170722383146, "epoch": 250, "n_parameters": 93325440} +{"train_lr": 0.000880645776278082, "train_min_lr": 0.000880645776278082, "train_loss": 0.6477152522510061, "train_loss_scale": 4012819.6923076925, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 251, "n_parameters": 93325440} +{"train_lr": 0.000870621433709802, "train_min_lr": 0.000870621433709802, "train_loss": 0.647687696135388, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.053430883751179166, "epoch": 252, "n_parameters": 93325440} +{"train_lr": 0.0008606225553639452, "train_min_lr": 0.0008606225553639452, "train_loss": 0.6475503460193673, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.05456276907800482, "epoch": 253, "n_parameters": 93325440} +{"train_lr": 0.0008506499026938082, "train_min_lr": 0.0008506499026938082, "train_loss": 0.647404620041832, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.053933491894545466, "epoch": 254, "n_parameters": 93325440} +{"train_lr": 0.0008407042351555041, "train_min_lr": 0.0008407042351555041, "train_loss": 0.6473850671631786, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.054598415115227304, "epoch": 255, "n_parameters": 93325440} +{"train_lr": 0.0008307863101501201, "train_min_lr": 0.0008307863101501201, "train_loss": 0.6478753392942823, 
"train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.05803065253899266, "epoch": 256, "n_parameters": 93325440} +{"train_lr": 0.0008208968829660467, "train_min_lr": 0.0008208968829660467, "train_loss": 0.6472325039645418, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.05549604690466554, "epoch": 257, "n_parameters": 93325440} +{"train_lr": 0.0008110367067214505, "train_min_lr": 0.0008110367067214505, "train_loss": 0.6470743234579762, "train_loss_scale": 3515418.2564102565, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.05561789496539113, "epoch": 258, "n_parameters": 93325440} +{"train_lr": 0.0008012065323069282, "train_min_lr": 0.0008012065323069282, "train_loss": 0.6470537870549239, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.05580556814152843, "epoch": 259, "n_parameters": 93325440} +{"train_lr": 0.0007914071083283216, "train_min_lr": 0.0007914071083283216, "train_loss": 0.6468538586050272, "train_loss_scale": 3071789.9487179485, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 260, "n_parameters": 93325440} +{"train_lr": 0.0007816391810497043, "train_min_lr": 0.0007816391810497043, "train_loss": 0.646771350899377, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.0561039160268429, "epoch": 261, "n_parameters": 93325440} +{"train_lr": 0.0007719034943365599, "train_min_lr": 0.0007719034943365599, "train_loss": 0.6466650695611651, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.05663711445119519, "epoch": 262, "n_parameters": 93325440} +{"train_lr": 0.0007622007895991216, "train_min_lr": 0.0007622007895991216, "train_loss": 0.646612029761458, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.05639886655486547, "epoch": 263, 
"n_parameters": 93325440} +{"train_lr": 0.0007525318057359233, "train_min_lr": 0.0007525318057359233, "train_loss": 0.6464776289529908, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.05716049529086703, "epoch": 264, "n_parameters": 93325440} +{"train_lr": 0.0007428972790775184, "train_min_lr": 0.0007428972790775184, "train_loss": 0.6463966291063489, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.05726632617939359, "epoch": 265, "n_parameters": 93325440} +{"train_lr": 0.0007332979433304174, "train_min_lr": 0.0007332979433304174, "train_loss": 0.6463383005406612, "train_loss_scale": 2359296.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.05750849833473181, "epoch": 266, "n_parameters": 93325440} +{"train_lr": 0.0007237345295211991, "train_min_lr": 0.0007237345295211991, "train_loss": 0.6462797559797764, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.0577031647762618, "epoch": 267, "n_parameters": 93325440} +{"train_lr": 0.0007142077659408527, "train_min_lr": 0.0007142077659408527, "train_loss": 0.6460776025524888, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.05781100867077326, "epoch": 268, "n_parameters": 93325440} +{"train_lr": 0.0007047183780893101, "train_min_lr": 0.0007047183780893101, "train_loss": 0.6460817293622173, "train_loss_scale": 3522139.8974358975, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 269, "n_parameters": 93325440} +{"train_lr": 0.0006952670886201941, "train_min_lr": 0.0006952670886201941, "train_loss": 0.6458973227164302, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.05810472993657757, "epoch": 270, "n_parameters": 93325440} +{"train_lr": 0.0006858546172857918, "train_min_lr": 0.0006858546172857918, "train_loss": 0.6459075553008379, 
"train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.05934882303699851, "epoch": 271, "n_parameters": 93325440} +{"train_lr": 0.0006764816808822353, "train_min_lr": 0.0006764816808822353, "train_loss": 0.645777548233477, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.058964002722253404, "epoch": 272, "n_parameters": 93325440} +{"train_lr": 0.0006671489931949224, "train_min_lr": 0.0006671489931949224, "train_loss": 0.6455980250850702, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.0593459837926695, "epoch": 273, "n_parameters": 93325440} +{"train_lr": 0.000657857264944153, "train_min_lr": 0.000657857264944153, "train_loss": 0.6455329716062317, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.05996235124528026, "epoch": 274, "n_parameters": 93325440} +{"train_lr": 0.0006486072037310055, "train_min_lr": 0.0006486072037310055, "train_loss": 0.6459870672760866, "train_loss_scale": 1539255.7948717948, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 275, "n_parameters": 93325440} +{"train_lr": 0.0006393995139834575, "train_min_lr": 0.0006393995139834575, "train_loss": 0.6454331303875034, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06105376998535716, "epoch": 276, "n_parameters": 93325440} +{"train_lr": 0.0006302348969027304, "train_min_lr": 0.0006302348969027304, "train_loss": 0.6450987507660801, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06070478687970302, "epoch": 277, "n_parameters": 93325440} +{"train_lr": 0.0006211140504098989, "train_min_lr": 0.0006211140504098989, "train_loss": 0.6451377625314471, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06072737142825738, "epoch": 278, "n_parameters": 
93325440} +{"train_lr": 0.0006120376690927338, "train_min_lr": 0.0006120376690927338, "train_loss": 0.6450422000951874, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.061695305057443105, "epoch": 279, "n_parameters": 93325440} +{"train_lr": 0.0006030064441528148, "train_min_lr": 0.0006030064441528148, "train_loss": 0.6448882166296244, "train_loss_scale": 1048576.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.061000630307273984, "epoch": 280, "n_parameters": 93325440} +{"train_lr": 0.0005940210633528858, "train_min_lr": 0.0005940210633528858, "train_loss": 0.6448262258408926, "train_loss_scale": 1176287.1794871795, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06153951602008862, "epoch": 281, "n_parameters": 93325440} +{"train_lr": 0.0005850822109644842, "train_min_lr": 0.0005850822109644842, "train_loss": 0.6445927659574991, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.062148432581661604, "epoch": 282, "n_parameters": 93325440} +{"train_lr": 0.0005761905677158267, "train_min_lr": 0.0005761905677158267, "train_loss": 0.6445580886629148, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06258190625036757, "epoch": 283, "n_parameters": 93325440} +{"train_lr": 0.0005673468107399736, "train_min_lr": 0.0005673468107399736, "train_loss": 0.6444872012361884, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06241093659534668, "epoch": 284, "n_parameters": 93325440} +{"train_lr": 0.0005585516135232553, "train_min_lr": 0.0005585516135232553, "train_loss": 0.6443635298607823, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06303226940620404, "epoch": 285, "n_parameters": 93325440} +{"train_lr": 0.0005498056458539954, "train_min_lr": 0.0005498056458539954, "train_loss": 0.6442393993911071, 
"train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06352718579224668, "epoch": 286, "n_parameters": 93325440} +{"train_lr": 0.0005411095737714909, "train_min_lr": 0.0005411095737714909, "train_loss": 0.6442402946309019, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06359702980336852, "epoch": 287, "n_parameters": 93325440} +{"train_lr": 0.0005324640595153003, "train_min_lr": 0.0005324640595153003, "train_loss": 0.6440057540073608, "train_loss_scale": 3589356.3076923075, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06370922606677198, "epoch": 288, "n_parameters": 93325440} +{"train_lr": 0.0005238697614748063, "train_min_lr": 0.0005238697614748063, "train_loss": 0.643923002987718, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06471771006591809, "epoch": 289, "n_parameters": 93325440} +{"train_lr": 0.0005153273341390795, "train_min_lr": 0.0005153273341390795, "train_loss": 0.6438843237761503, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06490508722475706, "epoch": 290, "n_parameters": 93325440} +{"train_lr": 0.0005068374280470331, "train_min_lr": 0.0005068374280470331, "train_loss": 0.6437251456320668, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06503680388992414, "epoch": 291, "n_parameters": 93325440} +{"train_lr": 0.0004984006897378886, "train_min_lr": 0.0004984006897378886, "train_loss": 0.6435524815311416, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06528743010205336, "epoch": 292, "n_parameters": 93325440} +{"train_lr": 0.0004900177617019307, "train_min_lr": 0.0004900177617019307, "train_loss": 0.6435383766507491, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06551545427347986, "epoch": 293, 
"n_parameters": 93325440} +{"train_lr": 0.00048168928233158545, "train_min_lr": 0.00048168928233158545, "train_loss": 0.6435012809024789, "train_loss_scale": 4207747.282051282, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 294, "n_parameters": 93325440} +{"train_lr": 0.00047341588587280147, "train_min_lr": 0.00047341588587280147, "train_loss": 0.6432838347764351, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06607394509065227, "epoch": 295, "n_parameters": 93325440} +{"train_lr": 0.00046519820237675105, "train_min_lr": 0.00046519820237675105, "train_loss": 0.6431936899152322, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06693988484449875, "epoch": 296, "n_parameters": 93325440} +{"train_lr": 0.0004570368576518498, "train_min_lr": 0.0004570368576518498, "train_loss": 0.6430653700222954, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06650815351316944, "epoch": 297, "n_parameters": 93325440} +{"train_lr": 0.00044893247321609476, "train_min_lr": 0.00044893247321609476, "train_loss": 0.6430479057180958, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06812319378010355, "epoch": 298, "n_parameters": 93325440} +{"train_lr": 0.0004408856662497389, "train_min_lr": 0.0004408856662497389, "train_loss": 0.6429014222648664, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06764771367829198, "epoch": 299, "n_parameters": 93325440} +{"train_lr": 0.00043289704954828676, "train_min_lr": 0.00043289704954828676, "train_loss": 0.6429314889157047, "train_loss_scale": 3522139.8974358975, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 300, "n_parameters": 93325440} +{"train_lr": 0.0004249672314758303, "train_min_lr": 0.0004249672314758303, "train_loss": 0.642660735700375, 
"train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06784943320478003, "epoch": 301, "n_parameters": 93325440} +{"train_lr": 0.0004170968159187159, "train_min_lr": 0.0004170968159187159, "train_loss": 0.6425696528779391, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06832408513396214, "epoch": 302, "n_parameters": 93325440} +{"train_lr": 0.0004092864022395612, "train_min_lr": 0.0004092864022395612, "train_loss": 0.6424364460488925, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06891751661896706, "epoch": 303, "n_parameters": 93325440} +{"train_lr": 0.00040153658523160577, "train_min_lr": 0.00040153658523160577, "train_loss": 0.6424073156876824, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06886650525176755, "epoch": 304, "n_parameters": 93325440} +{"train_lr": 0.0003938479550734206, "train_min_lr": 0.0003938479550734206, "train_loss": 0.6422329479828477, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06983281817669287, "epoch": 305, "n_parameters": 93325440} +{"train_lr": 0.0003862210972839593, "train_min_lr": 0.0003862210972839593, "train_loss": 0.6421345776806657, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.069272606060482, "epoch": 306, "n_parameters": 93325440} +{"train_lr": 0.00037865659267797083, "train_min_lr": 0.00037865659267797083, "train_loss": 0.6420191806764939, "train_loss_scale": 4006098.0512820515, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07026870318282491, "epoch": 307, "n_parameters": 93325440} +{"train_lr": 0.0003711550173217691, "train_min_lr": 0.0003711550173217691, "train_loss": 0.6419261168831816, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07061367023449677, "epoch": 308, 
"n_parameters": 93325440} +{"train_lr": 0.00036371694248936003, "train_min_lr": 0.00036371694248936003, "train_loss": 0.6419294705518928, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07054317421399248, "epoch": 309, "n_parameters": 93325440} +{"train_lr": 0.00035634293461894045, "train_min_lr": 0.00035634293461894045, "train_loss": 0.6417247991149242, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07073189578472804, "epoch": 310, "n_parameters": 93325440} +{"train_lr": 0.00034903355526975867, "train_min_lr": 0.00034903355526975867, "train_loss": 0.6417726809158921, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07111873459787323, "epoch": 311, "n_parameters": 93325440} +{"train_lr": 0.00034178936107935213, "train_min_lr": 0.00034178936107935213, "train_loss": 0.6415677572576663, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07161029661074281, "epoch": 312, "n_parameters": 93325440} +{"train_lr": 0.00033461090372115536, "train_min_lr": 0.00033461090372115536, "train_loss": 0.641492684252369, "train_loss_scale": 4234633.846153846, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 313, "n_parameters": 93325440} +{"train_lr": 0.0003274987298624889, "train_min_lr": 0.0003274987298624889, "train_loss": 0.6412481918739967, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07331909517494914, "epoch": 314, "n_parameters": 93325440} +{"train_lr": 0.0003204533811229274, "train_min_lr": 0.0003204533811229274, "train_loss": 0.6412237585307314, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07187579928013758, "epoch": 315, "n_parameters": 93325440} +{"train_lr": 0.0003134753940330548, "train_min_lr": 0.0003134753940330548, "train_loss": 0.6410378232025183, 
"train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07243024513650781, "epoch": 316, "n_parameters": 93325440} +{"train_lr": 0.00030656529999360446, "train_min_lr": 0.00030656529999360446, "train_loss": 0.6408698111533736, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07311062428813714, "epoch": 317, "n_parameters": 93325440} +{"train_lr": 0.00029972362523499117, "train_min_lr": 0.00029972362523499117, "train_loss": 0.6409426364713372, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07305406825616956, "epoch": 318, "n_parameters": 93325440} +{"train_lr": 0.00029295089077723615, "train_min_lr": 0.00029295089077723615, "train_loss": 0.6407700431031677, "train_loss_scale": 4207747.282051282, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 319, "n_parameters": 93325440} +{"train_lr": 0.0002862476123902899, "train_min_lr": 0.0002862476123902899, "train_loss": 0.6407054029643918, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.0735805794620552, "epoch": 320, "n_parameters": 93325440} +{"train_lr": 0.00027961430055475504, "train_min_lr": 0.00027961430055475504, "train_loss": 0.6406478409488232, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07441432332285704, "epoch": 321, "n_parameters": 93325440} +{"train_lr": 0.00027305146042300914, "train_min_lr": 0.00027305146042300914, "train_loss": 0.640592068959123, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07498312365406981, "epoch": 322, "n_parameters": 93325440} +{"train_lr": 0.00026655959178073735, "train_min_lr": 0.00026655959178073735, "train_loss": 0.6404183830062931, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07469649117392226, "epoch": 323, 
"n_parameters": 93325440} +{"train_lr": 0.00026013918900887165, "train_min_lr": 0.00026013918900887165, "train_loss": 0.6402167561344612, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07498014215618753, "epoch": 324, "n_parameters": 93325440} +{"train_lr": 0.00025379074104594005, "train_min_lr": 0.00025379074104594005, "train_loss": 0.6401908083412892, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.0752872514944428, "epoch": 325, "n_parameters": 93325440} +{"train_lr": 0.00024751473135083417, "train_min_lr": 0.00024751473135083417, "train_loss": 0.6400560573555338, "train_loss_scale": 4274963.692307692, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 326, "n_parameters": 93325440} +{"train_lr": 0.00024131163786599068, "train_min_lr": 0.00024131163786599068, "train_loss": 0.639981580277284, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07499027719052556, "epoch": 327, "n_parameters": 93325440} +{"train_lr": 0.00023518193298099495, "train_min_lr": 0.00023518193298099495, "train_loss": 0.6398042239821874, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07650711690672697, "epoch": 328, "n_parameters": 93325440} +{"train_lr": 0.00022912608349660648, "train_min_lr": 0.00022912608349660648, "train_loss": 0.6398056661471342, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07644278692224851, "epoch": 329, "n_parameters": 93325440} +{"train_lr": 0.0002231445505892088, "train_min_lr": 0.0002231445505892088, "train_loss": 0.6396185159205626, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07728877522719976, "epoch": 330, "n_parameters": 93325440} +{"train_lr": 0.00021723778977569177, "train_min_lr": 0.00021723778977569177, "train_loss": 0.6396183661925487, 
"train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07660753896030095, "epoch": 331, "n_parameters": 93325440} +{"train_lr": 0.00021140625087876029, "train_min_lr": 0.00021140625087876029, "train_loss": 0.6395186086973319, "train_loss_scale": 4328736.820512821, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 332, "n_parameters": 93325440} +{"train_lr": 0.0002056503779926791, "train_min_lr": 0.0002056503779926791, "train_loss": 0.6393728998418038, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07701161621998136, "epoch": 333, "n_parameters": 93325440} +{"train_lr": 0.00019997060944945298, "train_min_lr": 0.00019997060944945298, "train_loss": 0.6392656035291461, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07688560603090967, "epoch": 334, "n_parameters": 93325440} +{"train_lr": 0.00019436737778544695, "train_min_lr": 0.00019436737778544695, "train_loss": 0.6391785949802934, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07801455034850499, "epoch": 335, "n_parameters": 93325440} +{"train_lr": 0.00018884110970844584, "train_min_lr": 0.00018884110970844584, "train_loss": 0.6390720842979275, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07811531333778149, "epoch": 336, "n_parameters": 93325440} +{"train_lr": 0.00018339222606515945, "train_min_lr": 0.00018339222606515945, "train_loss": 0.6390230263559482, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07809111967873879, "epoch": 337, "n_parameters": 93325440} +{"train_lr": 0.00017802114180917348, "train_min_lr": 0.00017802114180917348, "train_loss": 0.6388136494952517, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07835216268610495, "epoch": 338, 
"n_parameters": 93325440} +{"train_lr": 0.00017272826596934892, "train_min_lr": 0.00017272826596934892, "train_loss": 0.6388165717705702, "train_loss_scale": 4234633.846153846, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 339, "n_parameters": 93325440} +{"train_lr": 0.00016751400161867366, "train_min_lr": 0.00016751400161867366, "train_loss": 0.638698265553476, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07814501840860033, "epoch": 340, "n_parameters": 93325440} +{"train_lr": 0.00016237874584356537, "train_min_lr": 0.00016237874584356537, "train_loss": 0.6386319290225705, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07872966158753023, "epoch": 341, "n_parameters": 93325440} +{"train_lr": 0.00015732288971363333, "train_min_lr": 0.00015732288971363333, "train_loss": 0.6385865815652486, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07859047117810218, "epoch": 342, "n_parameters": 93325440} +{"train_lr": 0.00015234681825189645, "train_min_lr": 0.00015234681825189645, "train_loss": 0.63845045119524, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.079885441212891, "epoch": 343, "n_parameters": 93325440} +{"train_lr": 0.0001474509104054623, "train_min_lr": 0.0001474509104054623, "train_loss": 0.6383905794519262, "train_loss_scale": 4113644.3076923075, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 344, "n_parameters": 93325440} +{"train_lr": 0.00014263553901666846, "train_min_lr": 0.00014263553901666846, "train_loss": 0.6382861136673734, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07959201844791189, "epoch": 345, "n_parameters": 93325440} +{"train_lr": 0.00013790107079468978, "train_min_lr": 0.00013790107079468978, "train_loss": 0.6382117107367287, 
"train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07981004401181753, "epoch": 346, "n_parameters": 93325440} +{"train_lr": 0.00013324786628761168, "train_min_lr": 0.00013324786628761168, "train_loss": 0.6381998471915722, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07895925439273317, "epoch": 347, "n_parameters": 93325440} +{"train_lr": 0.00012867627985497265, "train_min_lr": 0.00012867627985497265, "train_loss": 0.6379489563644315, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07938735000789165, "epoch": 348, "n_parameters": 93325440} +{"train_lr": 0.00012418665964077964, "train_min_lr": 0.00012418665964077964, "train_loss": 0.637967341245176, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07966416738688564, "epoch": 349, "n_parameters": 93325440} +{"train_lr": 0.00011977934754699389, "train_min_lr": 0.00011977934754699389, "train_loss": 0.6379127469009314, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07942784486863858, "epoch": 350, "n_parameters": 93325440} +{"train_lr": 0.00011545467920749486, "train_min_lr": 0.00011545467920749486, "train_loss": 0.6377013552790651, "train_loss_scale": 3414593.641025641, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07806967773164313, "epoch": 351, "n_parameters": 93325440} +{"train_lr": 0.00011121298396252068, "train_min_lr": 0.00011121298396252068, "train_loss": 0.6377180949665415, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07993513080649652, "epoch": 352, "n_parameters": 93325440} +{"train_lr": 0.00010705458483358618, "train_min_lr": 0.00010705458483358618, "train_loss": 0.637548968124275, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07938833866650477, "epoch": 
353, "n_parameters": 93325440} +{"train_lr": 0.00010297979849888524, "train_min_lr": 0.00010297979849888524, "train_loss": 0.6374551640489162, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.08055205480792584, "epoch": 354, "n_parameters": 93325440} +{"train_lr": 9.89889352691732e-05, "train_min_lr": 9.89889352691732e-05, "train_loss": 0.6374118199380927, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07887526011715333, "epoch": 355, "n_parameters": 93325440} +{"train_lr": 9.508229906413639e-05, "train_min_lr": 9.508229906413639e-05, "train_loss": 0.6373603121879009, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07952352477094302, "epoch": 356, "n_parameters": 93325440} +{"train_lr": 9.126018738924708e-05, "train_min_lr": 9.126018738924708e-05, "train_loss": 0.6372977341405857, "train_loss_scale": 4301850.256410256, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 357, "n_parameters": 93325440} +{"train_lr": 8.752289131310686e-05, "train_min_lr": 8.752289131310686e-05, "train_loss": 0.6372137449633999, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07873909236290134, "epoch": 358, "n_parameters": 93325440} +{"train_lr": 8.387069544528183e-05, "train_min_lr": 8.387069544528183e-05, "train_loss": 0.6371395650486915, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.08040461130440235, "epoch": 359, "n_parameters": 93325440} +{"train_lr": 8.030387791462727e-05, "train_min_lr": 8.030387791462727e-05, "train_loss": 0.6370332259923602, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07940743340609166, "epoch": 360, "n_parameters": 93325440} +{"train_lr": 7.682271034810752e-05, "train_min_lr": 7.682271034810752e-05, "train_loss": 0.6370450413475434, 
"train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07772449648771913, "epoch": 361, "n_parameters": 93325440} +{"train_lr": 7.342745785011076e-05, "train_min_lr": 7.342745785011076e-05, "train_loss": 0.6370495901657984, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07786894112061231, "epoch": 362, "n_parameters": 93325440} +{"train_lr": 7.011837898225992e-05, "train_min_lr": 7.011837898225992e-05, "train_loss": 0.6369192731590607, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07879469144898348, "epoch": 363, "n_parameters": 93325440} +{"train_lr": 6.689572574372245e-05, "train_min_lr": 6.689572574372245e-05, "train_loss": 0.6368134421511338, "train_loss_scale": 4221190.564102564, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 364, "n_parameters": 93325440} +{"train_lr": 6.375974355201949e-05, "train_min_lr": 6.375974355201949e-05, "train_loss": 0.6368455880632004, "train_loss_scale": 2910470.564102564, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 365, "n_parameters": 93325440} +{"train_lr": 6.0710671224336305e-05, "train_min_lr": 6.0710671224336305e-05, "train_loss": 0.6366940776411539, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07782639305178936, "epoch": 366, "n_parameters": 93325440} +{"train_lr": 5.774874095933571e-05, "train_min_lr": 5.774874095933571e-05, "train_loss": 0.6366343214295995, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07853440878291924, "epoch": 367, "n_parameters": 93325440} +{"train_lr": 5.487417831947492e-05, "train_min_lr": 5.487417831947492e-05, "train_loss": 0.6365780937843598, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.0772015988970032, "epoch": 368, "n_parameters": 
93325440} +{"train_lr": 5.208720221382823e-05, "train_min_lr": 5.208720221382823e-05, "train_loss": 0.6364671008136028, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07735530239267227, "epoch": 369, "n_parameters": 93325440} +{"train_lr": 4.938802488141633e-05, "train_min_lr": 4.938802488141633e-05, "train_loss": 0.6364298100368335, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07671566227546488, "epoch": 370, "n_parameters": 93325440} +{"train_lr": 4.677685187504342e-05, "train_min_lr": 4.677685187504342e-05, "train_loss": 0.6363930054104481, "train_loss_scale": 2520615.3846153845, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07729287096896233, "epoch": 371, "n_parameters": 93325440} +{"train_lr": 4.4253882045643506e-05, "train_min_lr": 4.4253882045643506e-05, "train_loss": 0.6362656567675563, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07769798796671705, "epoch": 372, "n_parameters": 93325440} +{"train_lr": 4.18193075271371e-05, "train_min_lr": 4.18193075271371e-05, "train_loss": 0.6362442963589461, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07702068207212366, "epoch": 373, "n_parameters": 93325440} +{"train_lr": 3.947331372179967e-05, "train_min_lr": 3.947331372179967e-05, "train_loss": 0.6362754225444335, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07484106043687998, "epoch": 374, "n_parameters": 93325440} +{"train_lr": 3.7216079286142414e-05, "train_min_lr": 3.7216079286142414e-05, "train_loss": 0.6361566183324425, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.0760205382337937, "epoch": 375, "n_parameters": 93325440} +{"train_lr": 3.5047776117306726e-05, "train_min_lr": 3.5047776117306726e-05, "train_loss": 0.6362235399249655, 
"train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07602877177011508, "epoch": 376, "n_parameters": 93325440} +{"train_lr": 3.296856933997393e-05, "train_min_lr": 3.296856933997393e-05, "train_loss": 0.635972078746328, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07373363789744102, "epoch": 377, "n_parameters": 93325440} +{"train_lr": 3.097861729379017e-05, "train_min_lr": 3.097861729379017e-05, "train_loss": 0.6361510580023512, "train_loss_scale": 4234633.846153846, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 378, "n_parameters": 93325440} +{"train_lr": 2.9078071521308036e-05, "train_min_lr": 2.9078071521308036e-05, "train_loss": 0.63601395005408, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07353063681139013, "epoch": 379, "n_parameters": 93325440} +{"train_lr": 2.726707675644639e-05, "train_min_lr": 2.726707675644639e-05, "train_loss": 0.6359621320063105, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07359813988352051, "epoch": 380, "n_parameters": 93325440} +{"train_lr": 2.5545770913468177e-05, "train_min_lr": 2.5545770913468177e-05, "train_loss": 0.6358990157023072, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07391278346618399, "epoch": 381, "n_parameters": 93325440} +{"train_lr": 2.3914285076477597e-05, "train_min_lr": 2.3914285076477597e-05, "train_loss": 0.6358039946987842, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07293483195826411, "epoch": 382, "n_parameters": 93325440} +{"train_lr": 2.2372743489437732e-05, "train_min_lr": 2.2372743489437732e-05, "train_loss": 0.635828505962705, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.0730095375329256, "epoch": 383, "n_parameters": 
93325440} +{"train_lr": 2.0921263546708787e-05, "train_min_lr": 2.0921263546708787e-05, "train_loss": 0.6358541949914817, "train_loss_scale": 4274963.692307692, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 384, "n_parameters": 93325440} +{"train_lr": 1.9559955784107943e-05, "train_min_lr": 1.9559955784107943e-05, "train_loss": 0.6357775855427369, "train_loss_scale": 2392904.205128205, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 385, "n_parameters": 93325440} +{"train_lr": 1.8288923870491908e-05, "train_min_lr": 1.8288923870491908e-05, "train_loss": 0.6357611800090243, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.0697251740986338, "epoch": 386, "n_parameters": 93325440} +{"train_lr": 1.7108264599861837e-05, "train_min_lr": 1.7108264599861837e-05, "train_loss": 0.6357341195241764, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.07034617092890236, "epoch": 387, "n_parameters": 93325440} +{"train_lr": 1.6018067883992388e-05, "train_min_lr": 1.6018067883992388e-05, "train_loss": 0.6357648971323402, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06915067615083013, "epoch": 388, "n_parameters": 93325440} +{"train_lr": 1.5018416745584281e-05, "train_min_lr": 1.5018416745584281e-05, "train_loss": 0.6355957043810915, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06922066757359947, "epoch": 389, "n_parameters": 93325440} +{"train_lr": 1.4109387311942033e-05, "train_min_lr": 1.4109387311942033e-05, "train_loss": 0.6356340470270087, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06942802757168046, "epoch": 390, "n_parameters": 93325440} +{"train_lr": 1.3291048809176455e-05, "train_min_lr": 1.3291048809176455e-05, "train_loss": 0.6357145799944798, 
"train_loss_scale": 3038181.7435897435, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.0692412858733382, "epoch": 391, "n_parameters": 93325440} +{"train_lr": 1.2563463556932867e-05, "train_min_lr": 1.2563463556932867e-05, "train_loss": 0.6356194600558434, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06838465781691365, "epoch": 392, "n_parameters": 93325440} +{"train_lr": 1.1926686963645178e-05, "train_min_lr": 1.1926686963645178e-05, "train_loss": 0.6356445192717589, "train_loss_scale": 4194304.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.0678466994387026, "epoch": 393, "n_parameters": 93325440} +{"train_lr": 1.1380767522316361e-05, "train_min_lr": 1.1380767522316361e-05, "train_loss": 0.6355957909702108, "train_loss_scale": 3690180.923076923, "train_weight_decay": 0.050000000000000266, "train_grad_norm": Infinity, "epoch": 394, "n_parameters": 93325440} +{"train_lr": 1.0925746806825483e-05, "train_min_lr": 1.0925746806825483e-05, "train_loss": 0.63564810082794, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06712529549184136, "epoch": 395, "n_parameters": 93325440} +{"train_lr": 1.0561659468761706e-05, "train_min_lr": 1.0561659468761706e-05, "train_loss": 0.6356388856298648, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06706462644088344, "epoch": 396, "n_parameters": 93325440} +{"train_lr": 1.0288533234785454e-05, "train_min_lr": 1.0288533234785454e-05, "train_loss": 0.6356670448126701, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06805484952070774, "epoch": 397, "n_parameters": 93325440} +{"train_lr": 1.0106388904516887e-05, "train_min_lr": 1.0106388904516887e-05, "train_loss": 0.6356371932734663, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.0677952241534606, "epoch": 398, 
"n_parameters": 93325440} +{"train_lr": 1.0015240348951963e-05, "train_min_lr": 1.0015240348951963e-05, "train_loss": 0.6355630416327562, "train_loss_scale": 2097152.0, "train_weight_decay": 0.050000000000000266, "train_grad_norm": 0.06769333966076374, "epoch": 399, "n_parameters": 93325440} diff --git a/cv/Self-Supervised Learning/MAE/pytorch/files/pretrain_base_0.75_400e_finetune_100e.txt b/cv/Self-Supervised Learning/MAE/pytorch/files/pretrain_base_0.75_400e_finetune_100e.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4e33adfae2f2bba6fd3f9c952a5022f30da00d0 --- /dev/null +++ b/cv/Self-Supervised Learning/MAE/pytorch/files/pretrain_base_0.75_400e_finetune_100e.txt @@ -0,0 +1,100 @@ +{"train_lr": 0.0003997441637352095, "train_min_lr": 9.49682763753549e-06, "train_loss": 5.592792673719873, "train_loss_scale": 65536.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.9024732707025145, "test_loss": 2.250357142452038, "test_acc1": 49.00800136184692, "test_acc5": 77.24600258117675, "epoch": 0, "n_parameters": 86406376} +{"train_lr": 0.001199872081867605, "train_min_lr": 2.8505677836826516e-05, "train_loss": 4.72904204533731, "train_loss_scale": 91886.60591526779, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 2.1820284749487704, "test_loss": 1.8586196506565267, "test_acc1": 57.59000160797119, "test_acc5": 83.0080024029541, "epoch": 1, "n_parameters": 86406376} +{"train_lr": 0.0019999999999999996, "train_min_lr": 4.751452803611757e-05, "train_loss": 4.511791644002989, "train_loss_scale": 131072.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.9337663032072816, "test_loss": 1.7343718714334748, "test_acc1": 60.788001712646484, "test_acc5": 84.77000248535157, "epoch": 2, "n_parameters": 86406376} +{"train_lr": 0.002800127918132395, "train_min_lr": 6.65233782354086e-05, "train_loss": 4.412802945819499, "train_loss_scale": 165647.34772182253, "train_weight_decay": 0.049999999999998865, 
"train_grad_norm": Infinity, "test_loss": 1.6765434082710382, "test_acc1": 61.76400176208496, "test_acc5": 85.60000259338379, "epoch": 3, "n_parameters": 86406376} +{"train_lr": 0.0036002558362647923, "train_min_lr": 8.553222843469963e-05, "train_loss": 4.3487851610906025, "train_loss_scale": 131072.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.5445282463070682, "test_loss": 1.6564826540874713, "test_acc1": 62.59200166107178, "test_acc5": 86.34200263458251, "epoch": 4, "n_parameters": 86406376} +{"train_lr": 0.00399963601955247, "train_min_lr": 9.502040889264574e-05, "train_loss": 4.273726266303318, "train_loss_scale": 254390.74020783373, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.4550523775086985, "test_loss": 1.6366071384964567, "test_acc1": 63.276001866149905, "test_acc5": 86.44400270233155, "epoch": 5, "n_parameters": 86406376} +{"train_lr": 0.003997450867536572, "train_min_lr": 9.496849565928443e-05, "train_loss": 4.1888694992835385, "train_loss_scale": 262563.09512390086, "train_weight_decay": 0.049999999999998865, "train_grad_norm": Infinity, "test_loss": 1.5427715507420627, "test_acc1": 65.14400178527832, "test_acc5": 87.7420023727417, "epoch": 6, "n_parameters": 86406376} +{"train_lr": 0.003993082079065952, "train_min_lr": 9.486470519814895e-05, "train_loss": 4.102591318859281, "train_loss_scale": 262144.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.36034827371486, "test_loss": 1.4743010774254799, "test_acc1": 66.52800190216064, "test_acc5": 88.52400257263183, "epoch": 7, "n_parameters": 86406376} +{"train_lr": 0.003986534431346677, "train_min_lr": 9.470915100258478e-05, "train_loss": 4.053977056539697, "train_loss_scale": 264239.4756195044, "train_weight_decay": 0.049999999999998865, "train_grad_norm": Infinity, "test_loss": 1.4227886884049936, "test_acc1": 67.17800191864013, "test_acc5": 88.94800263793945, "epoch": 8, "n_parameters": 86406376} +{"train_lr": 0.003977815084135385, 
"train_min_lr": 9.450200316882093e-05, "train_loss": 4.010094777476207, "train_loss_scale": 262772.6426858513, "train_weight_decay": 0.049999999999998865, "train_grad_norm": NaN, "test_loss": 1.3707310209671657, "test_acc1": 67.85000229400634, "test_acc5": 89.35600260375976, "epoch": 9, "n_parameters": 86406376} +{"train_lr": 0.003966933571910235, "train_min_lr": 9.424348820997227e-05, "train_loss": 3.960189716540557, "train_loss_scale": 262144.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.2916216715920172, "test_loss": 1.3670287588329026, "test_acc1": 68.58200217468261, "test_acc5": 89.8220024017334, "epoch": 10, "n_parameters": 86406376} +{"train_lr": 0.003953901793445091, "train_min_lr": 9.393388880835123e-05, "train_loss": 3.9119861596446337, "train_loss_scale": 263820.38049560355, "train_weight_decay": 0.049999999999998865, "train_grad_norm": Infinity, "test_loss": 1.3387660237424301, "test_acc1": 69.31200221679687, "test_acc5": 90.11600278381347, "epoch": 11, "n_parameters": 86406376} +{"train_lr": 0.0039387339987983876, "train_min_lr": 9.35735435063572e-05, "train_loss": 3.8878280554029296, "train_loss_scale": 262144.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.2759411016718851, "test_loss": 1.317402897459088, "test_acc1": 69.93800215942383, "test_acc5": 90.50200250427245, "epoch": 12, "n_parameters": 86406376} +{"train_lr": 0.003921446773730832, "train_min_lr": 9.316284633628823e-05, "train_loss": 3.8405822453310163, "train_loss_scale": 262563.09512390086, "train_weight_decay": 0.049999999999998865, "train_grad_norm": Infinity, "test_loss": 1.2674546460762168, "test_acc1": 70.28000240081788, "test_acc5": 90.71600255645752, "epoch": 13, "n_parameters": 86406376} +{"train_lr": 0.003902059021569276, "train_min_lr": 9.270224638946935e-05, "train_loss": 3.822385010268572, "train_loss_scale": 266963.5939248601, "train_weight_decay": 0.049999999999998865, "train_grad_norm": Infinity, "test_loss": 1.2905417021476862, 
"test_acc1": 70.80800233581543, "test_acc5": 90.9200025112915, "epoch": 14, "n_parameters": 86406376} +{"train_lr": 0.0038805919425360376, "train_min_lr": 9.21922473251803e-05, "train_loss": 3.7929010247345643, "train_loss_scale": 262144.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.2474000493018367, "test_loss": 1.260368033340483, "test_acc1": 71.12000229278564, "test_acc5": 91.212002449646, "epoch": 15, "n_parameters": 86406376} +{"train_lr": 0.0038570690105668697, "train_min_lr": 9.163340681991e-05, "train_loss": 3.767744296031604, "train_loss_scale": 264868.1183053557, "train_weight_decay": 0.049999999999998865, "train_grad_norm": Infinity, "test_loss": 1.2191570794040507, "test_acc1": 71.44800253173828, "test_acc5": 91.17600241455078, "epoch": 16, "n_parameters": 86406376} +{"train_lr": 0.0038315159476425124, "train_min_lr": 9.102633595754594e-05, "train_loss": 3.7525674749811966, "train_loss_scale": 211119.16866506793, "train_weight_decay": 0.049999999999998865, "train_grad_norm": Infinity, "test_loss": 1.2244567579843781, "test_acc1": 71.92600230895997, "test_acc5": 91.44800262237548, "epoch": 17, "n_parameters": 86406376} +{"train_lr": 0.003803960695662125, "train_min_lr": 9.037169856116365e-05, "train_loss": 3.7283766990561755, "train_loss_scale": 131072.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.2429498715175809, "test_loss": 1.221701103629488, "test_acc1": 71.9420024761963, "test_acc5": 91.43000235168456, "epoch": 18, "n_parameters": 86406376} +{"train_lr": 0.003774433385889171, "train_min_lr": 8.96702104671445e-05, "train_loss": 3.7078562758142333, "train_loss_scale": 220339.2613908873, "train_weight_decay": 0.049999999999998865, "train_grad_norm": Infinity, "test_loss": 1.186578752416553, "test_acc1": 72.27800240905762, "test_acc5": 91.63400238006592, "epoch": 19, "n_parameters": 86406376} +{"train_lr": 0.0037429663060033736, "train_min_lr": 8.892263874242024e-05, "train_loss": 3.6861478615817216, 
"train_loss_scale": 131072.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.237212994782854, "test_loss": 1.1895164420658892, "test_acc1": 72.70400249084473, "test_acc5": 91.86400274139405, "epoch": 20, "n_parameters": 86406376} +{"train_lr": 0.0037095938647945864, "train_min_lr": 8.812980084569599e-05, "train_loss": 3.6667192394165493, "train_loss_scale": 198022.44604316546, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.23850567306546, "test_loss": 1.1890250189286289, "test_acc1": 72.63400233886719, "test_acc5": 91.81200269256591, "epoch": 21, "n_parameters": 86406376} +{"train_lr": 0.0036743525545373133, "train_min_lr": 8.729256373357168e-05, "train_loss": 3.6495994302294523, "train_loss_scale": 134110.43964828138, "train_weight_decay": 0.049999999999998865, "train_grad_norm": Infinity, "test_loss": 1.1797227387626965, "test_acc1": 72.6700023840332, "test_acc5": 91.88400264190673, "epoch": 22, "n_parameters": 86406376} +{"train_lr": 0.0036372809110869152, "train_min_lr": 8.641184291253709e-05, "train_loss": 3.6341962369082927, "train_loss_scale": 133272.2494004796, "train_weight_decay": 0.049999999999998865, "train_grad_norm": Infinity, "test_loss": 1.133619707190629, "test_acc1": 73.65200251922607, "test_acc5": 92.42600243255615, "epoch": 23, "n_parameters": 86406376} +{"train_lr": 0.003598419471741207, "train_min_lr": 8.548860143787963e-05, "train_loss": 3.623862175132445, "train_loss_scale": 131072.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.2397670864963608, "test_loss": 1.146392026182377, "test_acc1": 73.51600252227783, "test_acc5": 92.32200248840331, "epoch": 24, "n_parameters": 86406376} +{"train_lr": 0.0035578107309135816, "train_min_lr": 8.452384886059656e-05, "train_loss": 3.603262832029451, "train_loss_scale": 169943.07274180657, "train_weight_decay": 0.049999999999998865, "train_grad_norm": Infinity, "test_loss": 1.1371470603978995, "test_acc1": 73.71600233764649, "test_acc5": 
92.34800261566163, "epoch": 25, "n_parameters": 86406376} +{"train_lr": 0.0035154990936659006, "train_min_lr": 8.351864012346724e-05, "train_loss": 3.587060103218237, "train_loss_scale": 131072.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.2464001103461408, "test_loss": 1.1180579838427631, "test_acc1": 74.0080023236084, "test_acc5": 92.44600270599365, "epoch": 26, "n_parameters": 86406376} +{"train_lr": 0.0034715308271522837, "train_min_lr": 8.24740744074869e-05, "train_loss": 3.5677696583892327, "train_loss_scale": 198651.08872901677, "train_weight_decay": 0.049999999999998865, "train_grad_norm": Infinity, "test_loss": 1.1091762039471755, "test_acc1": 74.03400264923096, "test_acc5": 92.5100022680664, "epoch": 27, "n_parameters": 86406376} +{"train_lr": 0.0034259540100266407, "train_min_lr": 8.139129392993009e-05, "train_loss": 3.559775885989626, "train_loss_scale": 131072.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.245898213794382, "test_loss": 1.1035979629466028, "test_acc1": 74.22600251861573, "test_acc5": 92.74400266723633, "epoch": 28, "n_parameters": 86406376} +{"train_lr": 0.003378818479869338, "train_min_lr": 8.027148269535177e-05, "train_loss": 3.5414186530500102, "train_loss_scale": 159256.14708233412, "train_weight_decay": 0.049999999999998865, "train_grad_norm": NaN, "test_loss": 1.130805738721833, "test_acc1": 74.23600257720948, "test_acc5": 92.69000242462158, "epoch": 29, "n_parameters": 86406376} +{"train_lr": 0.003330175778690558, "train_min_lr": 7.911586520089619e-05, "train_loss": 3.521332262183646, "train_loss_scale": 131072.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.25035829624112, "test_loss": 1.094184869843902, "test_acc1": 74.7340027456665, "test_acc5": 92.82000255218506, "epoch": 30, "n_parameters": 86406376} +{"train_lr": 0.0032800790965698953, "train_min_lr": 7.792570509732659e-05, "train_loss": 3.518297063063184, "train_loss_scale": 146054.65067945645, 
"train_weight_decay": 0.049999999999998865, "train_grad_norm": Infinity, "test_loss": 1.0666649909407804, "test_acc1": 75.09000274688721, "test_acc5": 93.040002550354, "epoch": 31, "n_parameters": 86406376} +{"train_lr": 0.003228583213493717, "train_min_lr": 7.670230380724284e-05, "train_loss": 3.503086110956663, "train_loss_scale": 107393.12549960033, "train_weight_decay": 0.049999999999998865, "train_grad_norm": NaN, "test_loss": 1.1114820763468742, "test_acc1": 74.80400245452881, "test_acc5": 93.01200267852784, "epoch": 32, "n_parameters": 86406376} +{"train_lr": 0.003175744439454117, "train_min_lr": 7.54469991019936e-05, "train_loss": 3.4825640880018116, "train_loss_scale": 65536.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.2646351176009571, "test_loss": 1.0542662342389424, "test_acc1": 75.2140026184082, "test_acc5": 93.19200239074706, "epoch": 33, "n_parameters": 86406376} +{"train_lr": 0.0031216205528747424, "train_min_lr": 7.416116363884391e-05, "train_loss": 3.4745392004648843, "train_loss_scale": 115513.09352517985, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.2644246759460414, "test_loss": 1.0513502453086954, "test_acc1": 75.61600242980957, "test_acc5": 93.25600278411865, "epoch": 34, "n_parameters": 86406376} +{"train_lr": 0.0030662707374309217, "train_min_lr": 7.28462034599943e-05, "train_loss": 3.464747577345819, "train_loss_scale": 152550.62509992006, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.2681104541301347, "test_loss": 1.0377614192213074, "test_acc1": 75.47400229766846, "test_acc5": 93.42200245391845, "epoch": 35, "n_parameters": 86406376} +{"train_lr": 0.0030097555173332125, "train_min_lr": 7.150355645509409e-05, "train_loss": 3.4465882360792275, "train_loss_scale": 152760.1726618705, "train_weight_decay": 0.049999999999998865, "train_grad_norm": Infinity, "test_loss": 1.0413121538180294, "test_acc1": 76.0880026235962, "test_acc5": 93.50000260162354, "epoch": 36, "n_parameters": 
86406376} +{"train_lr": 0.002952136691145041, "train_min_lr": 7.013469078893113e-05, "train_loss": 3.4340920159094437, "train_loss_scale": 76144.34532374101, "train_weight_decay": 0.049999999999998865, "train_grad_norm": Infinity, "test_loss": 1.0491064079105854, "test_acc1": 76.29600268890381, "test_acc5": 93.44600247528076, "epoch": 37, "n_parameters": 86406376} +{"train_lr": 0.0028934772642068875, "train_min_lr": 6.874110329601346e-05, "train_loss": 3.4251205486883456, "train_loss_scale": 32768.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.2765998900365487, "test_loss": 1.022272811920354, "test_acc1": 75.94400242462159, "test_acc5": 93.47400256835938, "epoch": 38, "n_parameters": 86406376} +{"train_lr": 0.002833841379740885, "train_min_lr": 6.732431784380422e-05, "train_loss": 3.4053156239380367, "train_loss_scale": 55739.651478816944, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.2819314090658624, "test_loss": 1.0281780152158304, "test_acc1": 76.07800248321533, "test_acc5": 93.67800233184815, "epoch": 39, "n_parameters": 86406376} +{"train_lr": 0.0027732942487111276, "train_min_lr": 6.58858836663944e-05, "train_loss": 3.391267495308753, "train_loss_scale": 72241.52198241407, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.284594273109802, "test_loss": 1.0111178991695244, "test_acc1": 76.77200236083985, "test_acc5": 93.71800272125245, "epoch": 40, "n_parameters": 86406376} +{"train_lr": 0.002711902078516466, "train_min_lr": 6.442737367043799e-05, "train_loss": 3.3891672721559956, "train_loss_scale": 131072.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.29443743026895, "test_loss": 0.9872939745810899, "test_acc1": 77.08800261322021, "test_acc5": 93.962002182312, "epoch": 41, "n_parameters": 86406376} +{"train_lr": 0.0026497320005936377, "train_min_lr": 6.295038271520207e-05, "train_loss": 3.3599977008968613, "train_loss_scale": 69936.49880095923, "train_weight_decay": 
0.049999999999998865, "train_grad_norm": NaN, "test_loss": 0.9843310804078074, "test_acc1": 76.86400251556397, "test_acc5": 93.89400253112792, "epoch": 42, "n_parameters": 86406376} +{"train_lr": 0.002586851997010088, "train_min_lr": 6.145652586861125e-05, "train_loss": 3.350256350710333, "train_loss_scale": 87433.72022382094, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.3008638261128769, "test_loss": 1.020211474461989, "test_acc1": 76.77600253479004, "test_acc5": 94.02400250640869, "epoch": 43, "n_parameters": 86406376} +{"train_lr": 0.0025233308261265472, "train_min_lr": 5.994743664119471e-05, "train_loss": 3.3359770345792685, "train_loss_scale": 131072.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.3001002219083497, "test_loss": 0.9721729861515941, "test_acc1": 77.35200261566162, "test_acc5": 94.17400259613036, "epoch": 44, "n_parameters": 86406376} +{"train_lr": 0.0024592379474107535, "train_min_lr": 5.8424765199866275e-05, "train_loss": 3.321092938859876, "train_loss_scale": 202003.8497202238, "train_weight_decay": 0.049999999999998865, "train_grad_norm": Infinity, "test_loss": 0.9848409746632432, "test_acc1": 77.5600025024414, "test_acc5": 94.12200234832764, "epoch": 45, "n_parameters": 86406376} +{"train_lr": 0.0023946434454845933, "train_min_lr": 5.689017656349148e-05, "train_loss": 3.315089634389614, "train_loss_scale": 131072.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.3133123738588475, "test_loss": 0.9504460749978368, "test_acc1": 77.61200245513916, "test_acc5": 94.41000273010253, "epoch": 46, "n_parameters": 86406376} +{"train_lr": 0.0023296179534875536, "train_min_lr": 5.5345348782213485e-05, "train_loss": 3.295001769975888, "train_loss_scale": 146159.42446043165, "train_weight_decay": 0.049999999999998865, "train_grad_norm": Infinity, "test_loss": 0.9754125220757542, "test_acc1": 77.63600252716064, "test_acc5": 94.26800239349365, "epoch": 47, "n_parameters": 86406376} +{"train_lr": 
0.0022642325758404375, "train_min_lr": 5.379197110253047e-05, "train_loss": 3.283714706508471, "train_loss_scale": 131072.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.3198145790923415, "test_loss": 0.9524194603842316, "test_acc1": 77.75800264068603, "test_acc5": 94.39600237579346, "epoch": 48, "n_parameters": 86406376} +{"train_lr": 0.002198558810493715, "train_min_lr": 5.2231742120128454e-05, "train_loss": 3.2698451197452303, "train_loss_scale": 135262.9512390088, "train_weight_decay": 0.049999999999998865, "train_grad_norm": Infinity, "test_loss": 0.9604115836096533, "test_acc1": 77.91400236907958, "test_acc5": 94.43600240570068, "epoch": 49, "n_parameters": 86406376} +{"train_lr": 0.0021326684707455917, "train_min_lr": 5.0666367922492635e-05, "train_loss": 3.25602960528182, "train_loss_scale": 136729.78417266186, "train_weight_decay": 0.049999999999998865, "train_grad_norm": Infinity, "test_loss": 0.9541752418559609, "test_acc1": 78.10200234954834, "test_acc5": 94.49800243286133, "epoch": 50, "n_parameters": 86406376} +{"train_lr": 0.0020666336067151868, "train_min_lr": 4.909756022332574e-05, "train_loss": 3.2334235293402087, "train_loss_scale": 131072.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.3345246459845064, "test_loss": 0.9387856721878052, "test_acc1": 78.2120023953247, "test_acc5": 94.5380024432373, "epoch": 51, "n_parameters": 86406376} +{"train_lr": 0.002000526426556805, "train_min_lr": 4.7527034490813655e-05, "train_loss": 3.2196662790483708, "train_loss_scale": 134110.43964828138, "train_weight_decay": 0.049999999999998865, "train_grad_norm": Infinity, "test_loss": 0.9254046622099299, "test_acc1": 78.57000256988525, "test_acc5": 94.66000276489258, "epoch": 52, "n_parameters": 86406376} +{"train_lr": 0.0019344192175013665, "train_min_lr": 4.595650807178665e-05, "train_loss": 3.2003976242195407, "train_loss_scale": 131072.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.35492267864023, 
"test_loss": 0.9210042589993188, "test_acc1": 78.64200243713378, "test_acc5": 94.77800249664307, "epoch": 53, "n_parameters": 86406376} +{"train_lr": 0.0018683842668114042, "train_min_lr": 4.438769831382569e-05, "train_loss": 3.1964066105304387, "train_loss_scale": 83347.54276578737, "train_weight_decay": 0.049999999999998865, "train_grad_norm": NaN, "test_loss": 0.9129695732033614, "test_acc1": 78.66000256378175, "test_acc5": 94.82200239837647, "epoch": 54, "n_parameters": 86406376} +{"train_lr": 0.0018024937827359805, "train_min_lr": 4.282232068736817e-05, "train_loss": 3.1861120321267515, "train_loss_scale": 84604.82813749001, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.3640874195441925, "test_loss": 0.9064727205444466, "test_acc1": 78.83800238372802, "test_acc5": 94.83600251617432, "epoch": 55, "n_parameters": 86406376} +{"train_lr": 0.001736819815552037, "train_min_lr": 4.1262086909865866e-05, "train_loss": 3.17465291219316, "train_loss_scale": 92069.96003197442, "train_weight_decay": 0.049999999999998865, "train_grad_norm": NaN, "test_loss": 0.9148290763524446, "test_acc1": 79.15800265716553, "test_acc5": 95.03800253784179, "epoch": 56, "n_parameters": 86406376} +{"train_lr": 0.0016714341787784702, "train_min_lr": 3.970870307404734e-05, "train_loss": 3.144534255674989, "train_loss_scale": 32768.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.373606550226585, "test_loss": 0.8905014463446357, "test_acc1": 79.25800266967774, "test_acc5": 95.04000248901367, "epoch": 57, "n_parameters": 86406376} +{"train_lr": 0.0016064083706491157, "train_min_lr": 3.8163867782330694e-05, "train_loss": 3.1398784050004753, "train_loss_scale": 49034.12949640288, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.3803659635577366, "test_loss": 0.8827446543357589, "test_acc1": 79.5120025479126, "test_acc5": 95.07800256408692, "epoch": 58, "n_parameters": 86406376} +{"train_lr": 0.001541813495930466, "train_min_lr": 
3.6629270289426194e-05, "train_loss": 3.131142016675928, "train_loss_scale": 65536.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.384632907230124, "test_loss": 0.8748288392794855, "test_acc1": 79.60600256134033, "test_acc5": 95.28400251556397, "epoch": 59, "n_parameters": 86406376} +{"train_lr": 0.001477720188169644, "train_min_lr": 3.510658865516169e-05, "train_loss": 3.1028173233417395, "train_loss_scale": 124366.47801758593, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.3892139310745313, "test_loss": 0.860443905334581, "test_acc1": 79.77000245788574, "test_acc5": 95.22800262817383, "epoch": 60, "n_parameters": 86406376} +{"train_lr": 0.0014141985324576819, "train_min_lr": 3.3597487909548427e-05, "train_loss": 3.091071269078125, "train_loss_scale": 143749.6274980016, "train_weight_decay": 0.049999999999998865, "train_grad_norm": NaN, "test_loss": 0.8695225790143013, "test_acc1": 79.92200239135742, "test_acc5": 95.23600255096436, "epoch": 61, "n_parameters": 86406376} +{"train_lr": 0.0013513179887924879, "train_min_lr": 3.210361823209538e-05, "train_loss": 3.077250486023897, "train_loss_scale": 131072.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.4087092674416986, "test_loss": 0.8605208184682962, "test_acc1": 80.09200233276367, "test_acc5": 95.32200256072998, "epoch": 62, "n_parameters": 86406376} +{"train_lr": 0.0012891473161253547, "train_min_lr": 3.062661314736197e-05, "train_loss": 3.061110954192712, "train_loss_scale": 154750.8745003997, "train_weight_decay": 0.049999999999998865, "train_grad_norm": NaN, "test_loss": 0.8431629847848054, "test_acc1": 80.08800251373292, "test_acc5": 95.41400272003173, "epoch": 63, "n_parameters": 86406376} +{"train_lr": 0.0012277544971740653, "train_min_lr": 2.9168087738723258e-05, "train_loss": 3.047550282282509, "train_loss_scale": 126147.63229416466, "train_weight_decay": 0.049999999999998865, "train_grad_norm": Infinity, "test_loss": 0.8572965063380472, 
"test_acc1": 80.30200247467042, "test_acc5": 95.38800263458252, "epoch": 64, "n_parameters": 86406376} +{"train_lr": 0.00116720666408478, "train_min_lr": 2.7729636882299765e-05, "train_loss": 3.0341893219404654, "train_loss_scale": 65536.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.425688858417203, "test_loss": 0.8496151291059725, "test_acc1": 80.46200228607178, "test_acc5": 95.4760026309204, "epoch": 65, "n_parameters": 86406376} +{"train_lr": 0.0011075700250240293, "train_min_lr": 2.6312833502983773e-05, "train_loss": 3.017358848647915, "train_loss_scale": 96758.58673061551, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.4325794039679756, "test_loss": 0.8399289361003673, "test_acc1": 80.73600263519288, "test_acc5": 95.5740025415039, "epoch": 66, "n_parameters": 86406376} +{"train_lr": 0.0010489097917810446, "train_min_lr": 2.491922685446939e-05, "train_loss": 3.006917386306085, "train_loss_scale": 131072.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.4392885264161108, "test_loss": 0.828515770873337, "test_acc1": 80.90000254302979, "test_acc5": 95.59400246246338, "epoch": 67, "n_parameters": 86406376} +{"train_lr": 0.0009912901084596437, "train_min_lr": 2.3550340825165907e-05, "train_loss": 2.988169783477684, "train_loss_scale": 139872.99760191847, "train_weight_decay": 0.049999999999998865, "train_grad_norm": NaN, "test_loss": 0.8285546066860358, "test_acc1": 80.79600262023926, "test_acc5": 95.58600239807129, "epoch": 68, "n_parameters": 86406376} +{"train_lr": 0.0009347739813375745, "train_min_lr": 2.2207672271848693e-05, "train_loss": 2.9821149582604614, "train_loss_scale": 141339.83053557156, "train_weight_decay": 0.049999999999998865, "train_grad_norm": Infinity, "test_loss": 0.8265396105972204, "test_acc1": 80.84600264892578, "test_acc5": 95.64800258666992, "epoch": 69, "n_parameters": 86406376} +{"train_lr": 0.0008794232099700762, "train_min_lr": 2.0892689382867885e-05, "train_loss": 
2.9535677609302633, "train_loss_scale": 78266.0143884892, "train_weight_decay": 0.049999999999998865, "train_grad_norm": NaN, "test_loss": 0.8108141973163142, "test_acc1": 80.98000256774903, "test_acc5": 95.71000276519776, "epoch": 70, "n_parameters": 86406376} +{"train_lr": 0.0008252983196129956, "train_min_lr": 1.960683007270625e-05, "train_loss": 2.939285759135878, "train_loss_scale": 79104.20463629096, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.4710120810783929, "test_loss": 0.8041500090198084, "test_acc1": 81.35200231933594, "test_acc5": 95.79800266754151, "epoch": 71, "n_parameters": 86406376} +{"train_lr": 0.0007724584950392799, "train_min_lr": 1.835150040964055e-05, "train_loss": 2.9297137927237173, "train_loss_scale": 131072.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.4739933761951924, "test_loss": 0.8124989109734694, "test_acc1": 81.53800241424561, "test_acc5": 95.86400275543213, "epoch": 72, "n_parameters": 86406376} +{"train_lr": 0.0007209615158213153, "train_min_lr": 1.7128073078226846e-05, "train_loss": 2.9062990601495398, "train_loss_scale": 143435.30615507593, "train_weight_decay": 0.049999999999998865, "train_grad_norm": Infinity, "test_loss": 0.8049701558369579, "test_acc1": 81.50000243896484, "test_acc5": 95.8880024987793, "epoch": 73, "n_parameters": 86406376} +{"train_lr": 0.0006708636931498053, "train_min_lr": 1.5937885878289863e-05, "train_loss": 2.8942230482824702, "train_loss_scale": 131072.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.4952019505458864, "test_loss": 0.7943584825279135, "test_acc1": 81.61800241699218, "test_acc5": 95.94200252105713, "epoch": 74, "n_parameters": 86406376} +{"train_lr": 0.0006222198082583128, "train_min_lr": 1.4782240262058604e-05, "train_loss": 2.8828329217614983, "train_loss_scale": 130286.19664268586, "train_weight_decay": 0.049999999999998865, "train_grad_norm": NaN, "test_loss": 0.7842949566741785, "test_acc1": 81.83000252075195, 
"test_acc5": 96.03400264709472, "epoch": 75, "n_parameters": 86406376} +{"train_lr": 0.0005750830525207831, "train_min_lr": 1.3662399911047398e-05, "train_loss": 2.866628373138529, "train_loss_scale": 65536.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.4989759313116828, "test_loss": 0.7911524822314581, "test_acc1": 81.93400256103516, "test_acc5": 95.92400254241943, "epoch": 76, "n_parameters": 86406376} +{"train_lr": 0.0005295049692875612, "train_min_lr": 1.2579589354238723e-05, "train_loss": 2.858981152315982, "train_loss_scale": 107183.57793764988, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.508202315329743, "test_loss": 0.7858635316969771, "test_acc1": 81.97600275817871, "test_acc5": 96.0340025894165, "epoch": 77, "n_parameters": 86406376} +{"train_lr": 0.0004855353975234924, "train_min_lr": 1.1534992629078736e-05, "train_loss": 2.834503419810443, "train_loss_scale": 135891.59392486012, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.5148741068790474, "test_loss": 0.7748192102845871, "test_acc1": 82.11600239501954, "test_acc5": 96.06000239318848, "epoch": 78, "n_parameters": 86406376} +{"train_lr": 0.00044322241730974525, "train_min_lr": 1.0529751986749836e-05, "train_loss": 2.8270453691935176, "train_loss_scale": 124680.79936051159, "train_weight_decay": 0.049999999999998865, "train_grad_norm": Infinity, "test_loss": 0.7718186390896639, "test_acc1": 82.36600251037598, "test_acc5": 96.13600261871338, "epoch": 79, "n_parameters": 86406376} +{"train_lr": 0.0004026122972689514, "train_min_lr": 9.564966643135621e-06, "train_loss": 2.8028785404231815, "train_loss_scale": 65536.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.5297076538217058, "test_loss": 0.770141156220978, "test_acc1": 82.1980024874878, "test_acc5": 96.17400264984131, "epoch": 80, "n_parameters": 86406376} +{"train_lr": 0.00036374944397114204, "train_min_lr": 8.64169157684452e-06, "train_loss": 2.7935122771562337, 
"train_loss_scale": 98958.83613109513, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.5345986534556229, "test_loss": 0.760394756992658, "test_acc1": 82.30000256713868, "test_acc5": 96.23800255218507, "epoch": 81, "n_parameters": 86406376} +{"train_lr": 0.00032667635337582196, "train_min_lr": 7.760936375606073e-06, "train_loss": 2.7894287793208465, "train_loss_scale": 80099.55555555556, "train_weight_decay": 0.049999999999998865, "train_grad_norm": Infinity, "test_loss": 0.7546397903651902, "test_acc1": 82.45600253967285, "test_acc5": 96.25400263122559, "epoch": 82, "n_parameters": 86406376} +{"train_lr": 0.0002914335643632545, "train_min_lr": 6.92366413230176e-06, "train_loss": 2.7764033634695027, "train_loss_scale": 77270.66346922462, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.5368766144311115, "test_loss": 0.7621058657991163, "test_acc1": 82.49200276916504, "test_acc5": 96.29200251678466, "epoch": 83, "n_parameters": 86406376} +{"train_lr": 0.0002580596144057944, "train_min_lr": 6.130790391836908e-06, "train_loss": 2.767983827742932, "train_loss_scale": 90053.06474820143, "train_weight_decay": 0.049999999999998865, "train_grad_norm": Infinity, "test_loss": 0.7654316950250756, "test_acc1": 82.48400293182372, "test_acc5": 96.33400266418457, "epoch": 84, "n_parameters": 86406376} +{"train_lr": 0.00022659099742773016, "train_min_lr": 5.383182150005858e-06, "train_loss": 2.753948577064023, "train_loss_scale": 67317.15427657873, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.5534542350174425, "test_loss": 0.74941034186067, "test_acc1": 82.6560022869873, "test_acc5": 96.29800259155273, "epoch": 85, "n_parameters": 86406376} +{"train_lr": 0.0001970621238997089, "train_min_lr": 4.6816569054447925e-06, "train_loss": 2.7435265696830125, "train_loss_scale": 101578.18065547562, "train_weight_decay": 0.049999999999998865, "train_grad_norm": Infinity, "test_loss": 0.7539658158114462, "test_acc1": 82.74000243957519, 
"test_acc5": 96.39000235168457, "epoch": 86, "n_parameters": 86406376} +{"train_lr": 0.00016950528321139544, "train_min_lr": 4.026981765708946e-06, "train_loss": 2.735401282946078, "train_loss_scale": 65536.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.5583705299859234, "test_loss": 0.7509403758202539, "test_acc1": 82.7800020727539, "test_acc5": 96.33400281677245, "epoch": 87, "n_parameters": 86406376} +{"train_lr": 0.00014395060836349695, "train_min_lr": 3.4198726084517775e-06, "train_loss": 2.726337208950834, "train_loss_scale": 94348.78976818545, "train_weight_decay": 0.049999999999998865, "train_grad_norm": NaN, "test_loss": 0.7506256821480665, "test_acc1": 82.76000263183593, "test_acc5": 96.38400246185303, "epoch": 88, "n_parameters": 86406376} +{"train_lr": 0.00012042604301776897, "train_min_lr": 2.8609932986232416e-06, "train_loss": 2.71985708111577, "train_loss_scale": 65536.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.5624949657659737, "test_loss": 0.7572676462657524, "test_acc1": 82.70600236114502, "test_acc5": 96.33000246185303, "epoch": 89, "n_parameters": 86406376} +{"train_lr": 9.895731094103386e-05, "train_min_lr": 2.350954962543275e-06, "train_loss": 2.720098126777928, "train_loss_scale": 117451.40847322143, "train_weight_decay": 0.049999999999998865, "train_grad_norm": NaN, "test_loss": 0.7487046465729222, "test_acc1": 82.79000234710693, "test_acc5": 96.43000239959717, "epoch": 90, "n_parameters": 86406376} +{"train_lr": 7.95678878766162e-05, "train_min_lr": 1.8903153196440734e-06, "train_loss": 2.7175606226892493, "train_loss_scale": 65536.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.5719138056063633, "test_loss": 0.7427014463552923, "test_acc1": 82.90000255493165, "test_acc5": 96.41400255584716, "epoch": 91, "n_parameters": 86406376} +{"train_lr": 6.227897587396463e-05, "train_min_lr": 1.47957807261209e-06, "train_loss": 2.705379573871001, "train_loss_scale": 70827.0759392486, 
"train_weight_decay": 0.049999999999998865, "train_grad_norm": NaN, "test_loss": 0.7494265872872237, "test_acc1": 82.91000229553222, "test_acc5": 96.39600263763428, "epoch": 92, "n_parameters": 86406376} +{"train_lr": 4.710948010452045e-05, "train_min_lr": 1.1191923565965795e-06, "train_loss": 2.702515984027029, "train_loss_scale": 65536.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.577457527938983, "test_loss": 0.7469095602405794, "test_acc1": 82.98600246978759, "test_acc5": 96.43200261444092, "epoch": 93, "n_parameters": 86406376} +{"train_lr": 3.407598818919137e-05, "train_min_lr": 8.095522480868717e-07, "train_loss": 2.6927911086762837, "train_loss_scale": 102573.5315747402, "train_weight_decay": 0.049999999999998865, "train_grad_norm": Infinity, "test_loss": 0.7422059704408501, "test_acc1": 82.98000233825684, "test_acc5": 96.45200250701905, "epoch": 94, "n_parameters": 86406376} +{"train_lr": 2.319275206003109e-05, "train_min_lr": 5.509963339955348e-07, "train_loss": 2.695670081080674, "train_loss_scale": 65536.0, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.576134430799934, "test_loss": 0.7438320368528366, "test_acc1": 83.03400236755371, "test_acc5": 96.47200257537841, "epoch": 95, "n_parameters": 86406376} +{"train_lr": 1.4471672375960707e-05, "train_min_lr": 3.438073414185469e-07, "train_loss": 2.6918061588832036, "train_loss_scale": 103464.10871302958, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.5773896862277024, "test_loss": 0.7437138512279048, "test_acc1": 83.04800273712158, "test_acc5": 96.47400258514405, "epoch": 96, "n_parameters": 86406376} +{"train_lr": 7.922285509573203e-06, "train_min_lr": 1.882118284773719e-07, "train_loss": 2.6919422102012605, "train_loss_scale": 87748.0415667466, "train_weight_decay": 0.049999999999998865, "train_grad_norm": NaN, "test_loss": 0.7450656880709258, "test_acc1": 83.04200254608155, "test_acc5": 96.47800242889404, "epoch": 97, "n_parameters": 86406376} 
+{"train_lr": 3.551753119249797e-06, "train_min_lr": 8.437993658098134e-08, "train_loss": 2.68782733827472, "train_loss_scale": 69622.17745803358, "train_weight_decay": 0.049999999999998865, "train_grad_norm": 1.5698409668452067, "test_loss": 0.7434264018454335, "test_acc1": 83.03800266601563, "test_acc5": 96.47000250701905, "epoch": 98, "n_parameters": 86406376} +{"train_lr": 1.3648543179918328e-06, "train_min_lr": 3.2425204378719534e-08, "train_loss": 2.6920088201308614, "train_loss_scale": 72713.00399680255, "train_weight_decay": 0.049999999999998865, "train_grad_norm": Infinity, "test_loss": 0.7439092839080276, "test_acc1": 83.044002578125, "test_acc5": 96.47200254608154, "epoch": 99, "n_parameters": 86406376} diff --git a/cv/Self-Supervised Learning/MAE/pytorch/files/pretrain_large_0.75_400e.txt b/cv/Self-Supervised Learning/MAE/pytorch/files/pretrain_large_0.75_400e.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b08de7f7ebdbd29e4daf8f0df4f1886cace4267 --- /dev/null +++ b/cv/Self-Supervised Learning/MAE/pytorch/files/pretrain_large_0.75_400e.txt @@ -0,0 +1,399 @@ +{"train_lr": 2.9906242487378794e-05, "train_min_lr": 2.9906242487378794e-05, "train_loss": 0.992053489266441, "train_loss_scale": 65536.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.045492654669289596, "epoch": 0, "n_parameters": 329209088} +{"train_lr": 8.991105056494909e-05, "train_min_lr": 8.991105056494909e-05, "train_loss": 0.9692792168830354, "train_loss_scale": 65536.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.1093189464464115, "epoch": 1, "n_parameters": 329209088} +{"train_lr": 0.0001499158586425194, "train_min_lr": 0.0001499158586425194, "train_loss": 0.9407013587486477, "train_loss_scale": 65536.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.16605017766451988, "epoch": 2, "n_parameters": 329209088} +{"train_lr": 0.00020992066672008975, "train_min_lr": 0.00020992066672008975, "train_loss": 
0.8689059213830683, "train_loss_scale": 65536.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.28281013352366596, "epoch": 3, "n_parameters": 329209088} +{"train_lr": 0.00026992547479766013, "train_min_lr": 0.00026992547479766013, "train_loss": 0.8189738675032575, "train_loss_scale": 65536.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.26077120684278315, "epoch": 4, "n_parameters": 329209088} +{"train_lr": 0.00032993028287523027, "train_min_lr": 0.00032993028287523027, "train_loss": 0.7901057703778721, "train_loss_scale": 65536.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.20059755356170428, "epoch": 5, "n_parameters": 329209088} +{"train_lr": 0.0003899350909528006, "train_min_lr": 0.0003899350909528006, "train_loss": 0.7649512290715789, "train_loss_scale": 104395.48717948717, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.1551318422007637, "epoch": 6, "n_parameters": 329209088} +{"train_lr": 0.00044993989903037104, "train_min_lr": 0.00044993989903037104, "train_loss": 0.7463964715970155, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.12646535741022, "epoch": 7, "n_parameters": 329209088} +{"train_lr": 0.0005099447071079412, "train_min_lr": 0.0005099447071079412, "train_loss": 0.7324579154284527, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.11009813217111887, "epoch": 8, "n_parameters": 329209088} +{"train_lr": 0.0005699495151855116, "train_min_lr": 0.0005699495151855116, "train_loss": 0.722336692060941, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.0991063874740249, "epoch": 9, "n_parameters": 329209088} +{"train_lr": 0.0006299543232630819, "train_min_lr": 0.0006299543232630819, "train_loss": 0.7136334531570379, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.08660566013975021, "epoch": 10, 
"n_parameters": 329209088} +{"train_lr": 0.0006899591313406521, "train_min_lr": 0.0006899591313406521, "train_loss": 0.7089173261983654, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.0938262621848247, "epoch": 11, "n_parameters": 329209088} +{"train_lr": 0.0007499639394182229, "train_min_lr": 0.0007499639394182229, "train_loss": 0.701731875914937, "train_loss_scale": 155017.84615384616, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.07424946780053851, "epoch": 12, "n_parameters": 329209088} +{"train_lr": 0.0008099687474957929, "train_min_lr": 0.0008099687474957929, "train_loss": 0.6992002461666766, "train_loss_scale": 223914.66666666666, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 13, "n_parameters": 329209088} +{"train_lr": 0.0008699735555733632, "train_min_lr": 0.0008699735555733632, "train_loss": 0.6940862127305127, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.0673549733697795, "epoch": 14, "n_parameters": 329209088} +{"train_lr": 0.0009299783636509335, "train_min_lr": 0.0009299783636509335, "train_loss": 0.6911372066690371, "train_loss_scale": 113427.69230769231, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 15, "n_parameters": 329209088} +{"train_lr": 0.0009899831717285039, "train_min_lr": 0.0009899831717285039, "train_loss": 0.687774730577635, "train_loss_scale": 65536.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.06226646937191104, "epoch": 16, "n_parameters": 329209088} +{"train_lr": 0.001049987979806074, "train_min_lr": 0.001049987979806074, "train_loss": 0.6849430643607122, "train_loss_scale": 65536.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05856762180486933, "epoch": 17, "n_parameters": 329209088} +{"train_lr": 0.0011099927878836444, "train_min_lr": 0.0011099927878836444, "train_loss": 0.6832775899973245, "train_loss_scale": 
65536.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.06119824143556448, "epoch": 18, "n_parameters": 329209088} +{"train_lr": 0.0011699975959612145, "train_min_lr": 0.0011699975959612145, "train_loss": 0.6807932547496583, "train_loss_scale": 65536.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.055511064015519924, "epoch": 19, "n_parameters": 329209088} +{"train_lr": 0.0012300024040387849, "train_min_lr": 0.0012300024040387849, "train_loss": 0.6789753519488164, "train_loss_scale": 65536.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05152321100617067, "epoch": 20, "n_parameters": 329209088} +{"train_lr": 0.0012900072121163552, "train_min_lr": 0.0012900072121163552, "train_loss": 0.6773042290005833, "train_loss_scale": 65536.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05118310434194521, "epoch": 21, "n_parameters": 329209088} +{"train_lr": 0.0013500120201939251, "train_min_lr": 0.0013500120201939251, "train_loss": 0.6757971871441278, "train_loss_scale": 121829.7435897436, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.04732887418224262, "epoch": 22, "n_parameters": 329209088} +{"train_lr": 0.0014100168282714964, "train_min_lr": 0.0014100168282714964, "train_loss": 0.674330414374335, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.04591951510295845, "epoch": 23, "n_parameters": 329209088} +{"train_lr": 0.001470021636349066, "train_min_lr": 0.001470021636349066, "train_loss": 0.6730096518444136, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.04374958989091027, "epoch": 24, "n_parameters": 329209088} +{"train_lr": 0.0015300264444266366, "train_min_lr": 0.0015300264444266366, "train_loss": 0.6722819894516411, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.048778077181524195, "epoch": 25, "n_parameters": 329209088} +{"train_lr": 
0.0015900312525042061, "train_min_lr": 0.0015900312525042061, "train_loss": 0.6709313489950429, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.04038288148764808, "epoch": 26, "n_parameters": 329209088} +{"train_lr": 0.0016500360605817771, "train_min_lr": 0.0016500360605817771, "train_loss": 0.6697047311478318, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.0391668945682259, "epoch": 27, "n_parameters": 329209088} +{"train_lr": 0.0017100408686593481, "train_min_lr": 0.0017100408686593481, "train_loss": 0.6692729077648181, "train_loss_scale": 189886.35897435897, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03930207984283184, "epoch": 28, "n_parameters": 329209088} +{"train_lr": 0.0017700456767369176, "train_min_lr": 0.0017700456767369176, "train_loss": 0.6682791179881837, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.037641401313102014, "epoch": 29, "n_parameters": 329209088} +{"train_lr": 0.0018300504848144882, "train_min_lr": 0.0018300504848144882, "train_loss": 0.6675869743100916, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.035335088757654794, "epoch": 30, "n_parameters": 329209088} +{"train_lr": 0.001890055292892058, "train_min_lr": 0.001890055292892058, "train_loss": 0.6667553564617171, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03407868015197798, "epoch": 31, "n_parameters": 329209088} +{"train_lr": 0.0019500601009696296, "train_min_lr": 0.0019500601009696296, "train_loss": 0.6660889951416697, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.032953488831527725, "epoch": 32, "n_parameters": 329209088} +{"train_lr": 0.0020100649090471997, "train_min_lr": 0.0020100649090471997, "train_loss": 0.6653759362868582, "train_loss_scale": 262144.0, "train_weight_decay": 
0.05000000000000026, "train_grad_norm": 0.03271053924273031, "epoch": 33, "n_parameters": 329209088} +{"train_lr": 0.002070069717124769, "train_min_lr": 0.002070069717124769, "train_loss": 0.6649193805284225, "train_loss_scale": 272226.46153846156, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.031562969082584366, "epoch": 34, "n_parameters": 329209088} +{"train_lr": 0.0021300745252023395, "train_min_lr": 0.0021300745252023395, "train_loss": 0.664195560474092, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.030752416294163618, "epoch": 35, "n_parameters": 329209088} +{"train_lr": 0.0021900793332799103, "train_min_lr": 0.0021900793332799103, "train_loss": 0.6637147109394368, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.030690982066190396, "epoch": 36, "n_parameters": 329209088} +{"train_lr": 0.00225008414135748, "train_min_lr": 0.00225008414135748, "train_loss": 0.6631887465333327, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02963579850844466, "epoch": 37, "n_parameters": 329209088} +{"train_lr": 0.002310088949435051, "train_min_lr": 0.002310088949435051, "train_loss": 0.6626990107664218, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02957984510785303, "epoch": 38, "n_parameters": 329209088} +{"train_lr": 0.0023700937575126205, "train_min_lr": 0.0023700937575126205, "train_loss": 0.6624330086335062, "train_loss_scale": 454130.8717948718, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 39, "n_parameters": 329209088} +{"train_lr": 0.002399984905490592, "train_min_lr": 0.002399984905490592, "train_loss": 0.6618415443047595, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.027928925704402037, "epoch": 40, "n_parameters": 329209088} +{"train_lr": 0.002399894048603015, "train_min_lr": 
0.002399894048603015, "train_loss": 0.6611817150125996, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.027318577664211776, "epoch": 41, "n_parameters": 329209088} +{"train_lr": 0.002399712195907412, "train_min_lr": 0.002399712195907412, "train_loss": 0.6604067583222133, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.0273476915481763, "epoch": 42, "n_parameters": 329209088} +{"train_lr": 0.0023994393612525775, "train_min_lr": 0.0023994393612525775, "train_loss": 0.6599090463386323, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02735026071492869, "epoch": 43, "n_parameters": 329209088} +{"train_lr": 0.0023990755654159225, "train_min_lr": 0.0023990755654159225, "train_loss": 0.6592303584926785, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02645150800116169, "epoch": 44, "n_parameters": 329209088} +{"train_lr": 0.00239862083610191, "train_min_lr": 0.00239862083610191, "train_loss": 0.6587562142596699, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.026439578869403936, "epoch": 45, "n_parameters": 329209088} +{"train_lr": 0.002398075207939935, "train_min_lr": 0.002398075207939935, "train_loss": 0.6579734718200202, "train_loss_scale": 231476.5128205128, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.025895795038638588, "epoch": 46, "n_parameters": 329209088} +{"train_lr": 0.002397438722481704, "train_min_lr": 0.002397438722481704, "train_loss": 0.6576415425500809, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.025771908939648897, "epoch": 47, "n_parameters": 329209088} +{"train_lr": 0.002396711428198033, "train_min_lr": 0.002396711428198033, "train_loss": 0.6571225086883761, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 
0.025198384188115597, "epoch": 48, "n_parameters": 329209088} +{"train_lr": 0.00239589338047521, "train_min_lr": 0.00239589338047521, "train_loss": 0.6565848011105584, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.025053061526985124, "epoch": 49, "n_parameters": 329209088} +{"train_lr": 0.0023949846416107326, "train_min_lr": 0.0023949846416107326, "train_loss": 0.6561868981016465, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02536077789651851, "epoch": 50, "n_parameters": 329209088} +{"train_lr": 0.002393985280808584, "train_min_lr": 0.002393985280808584, "train_loss": 0.6558876298248577, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.025169681184566937, "epoch": 51, "n_parameters": 329209088} +{"train_lr": 0.002392895374173956, "train_min_lr": 0.002392895374173956, "train_loss": 0.6553867289808412, "train_loss_scale": 355406.76923076925, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.025806471914387286, "epoch": 52, "n_parameters": 329209088} +{"train_lr": 0.002391715004707465, "train_min_lr": 0.002391715004707465, "train_loss": 0.6549916125547427, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02560123818544432, "epoch": 53, "n_parameters": 329209088} +{"train_lr": 0.0023904442622988075, "train_min_lr": 0.0023904442622988075, "train_loss": 0.6546838582159044, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.025092326988203403, "epoch": 54, "n_parameters": 329209088} +{"train_lr": 0.002389083243719943, "train_min_lr": 0.002389083243719943, "train_loss": 0.6543559112872642, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.025713752799977858, "epoch": 55, "n_parameters": 329209088} +{"train_lr": 0.002387632052617705, "train_min_lr": 0.002387632052617705, "train_loss": 
0.6539474247345844, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.025064760765347343, "epoch": 56, "n_parameters": 329209088} +{"train_lr": 0.002386090799505915, "train_min_lr": 0.002386090799505915, "train_loss": 0.6536016984818838, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.024740835968166206, "epoch": 57, "n_parameters": 329209088} +{"train_lr": 0.002384459601756962, "train_min_lr": 0.002384459601756962, "train_loss": 0.6533582593815831, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.024702626334216733, "epoch": 58, "n_parameters": 329209088} +{"train_lr": 0.0023827385835928716, "train_min_lr": 0.0023827385835928716, "train_loss": 0.6529816572124568, "train_loss_scale": 1020009.0256410256, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02484755742793473, "epoch": 59, "n_parameters": 329209088} +{"train_lr": 0.0023809278760758418, "train_min_lr": 0.0023809278760758418, "train_loss": 0.6530672328916785, "train_loss_scale": 1003204.9230769231, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 60, "n_parameters": 329209088} +{"train_lr": 0.0023790276170982585, "train_min_lr": 0.0023790276170982585, "train_loss": 0.6524543827459311, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02469511776087949, "epoch": 61, "n_parameters": 329209088} +{"train_lr": 0.002377037951372201, "train_min_lr": 0.002377037951372201, "train_loss": 0.6522959073205502, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.024549558048303693, "epoch": 62, "n_parameters": 329209088} +{"train_lr": 0.0023749590304184146, "train_min_lr": 0.0023749590304184146, "train_loss": 0.652119710151918, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.024972357673761554, "epoch": 63, 
"n_parameters": 329209088} +{"train_lr": 0.002372791012554783, "train_min_lr": 0.002372791012554783, "train_loss": 0.6516253335926777, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02455379637794044, "epoch": 64, "n_parameters": 329209088} +{"train_lr": 0.0023705340628842582, "train_min_lr": 0.0023705340628842582, "train_loss": 0.6515252530061377, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.025259264303036988, "epoch": 65, "n_parameters": 329209088} +{"train_lr": 0.002368188353282295, "train_min_lr": 0.002368188353282295, "train_loss": 0.6514551702600259, "train_loss_scale": 147035.89743589744, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 66, "n_parameters": 329209088} +{"train_lr": 0.002365754062383764, "train_min_lr": 0.002365754062383764, "train_loss": 0.6509612936060876, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02523881210706746, "epoch": 67, "n_parameters": 329209088} +{"train_lr": 0.0023632313755693403, "train_min_lr": 0.0023632313755693403, "train_loss": 0.6509679737978448, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.026225544983903185, "epoch": 68, "n_parameters": 329209088} +{"train_lr": 0.0023606204849513923, "train_min_lr": 0.0023606204849513923, "train_loss": 0.6506234402589214, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.024925658968874276, "epoch": 69, "n_parameters": 329209088} +{"train_lr": 0.002357921589359349, "train_min_lr": 0.002357921589359349, "train_loss": 0.6505284958328001, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.024706828360183116, "epoch": 70, "n_parameters": 329209088} +{"train_lr": 0.002355134894324556, "train_min_lr": 0.002355134894324556, "train_loss": 0.6501374820318933, "train_loss_scale": 
131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.025143304326308843, "epoch": 71, "n_parameters": 329209088} +{"train_lr": 0.002352260612064637, "train_min_lr": 0.002352260612064637, "train_loss": 0.6499877903335847, "train_loss_scale": 192406.97435897434, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.025103452866180584, "epoch": 72, "n_parameters": 329209088} +{"train_lr": 0.002349298961467303, "train_min_lr": 0.002349298961467303, "train_loss": 0.6498041582198288, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.024874905625788066, "epoch": 73, "n_parameters": 329209088} +{"train_lr": 0.0023462501680737214, "train_min_lr": 0.0023462501680737214, "train_loss": 0.6496891575949028, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.0250762593275748, "epoch": 74, "n_parameters": 329209088} +{"train_lr": 0.002343114464061315, "train_min_lr": 0.002343114464061315, "train_loss": 0.6497200881369795, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.026134809634337824, "epoch": 75, "n_parameters": 329209088} +{"train_lr": 0.0023398920882260785, "train_min_lr": 0.0023398920882260785, "train_loss": 0.649291190211303, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.025238819754658602, "epoch": 76, "n_parameters": 329209088} +{"train_lr": 0.002336583285964409, "train_min_lr": 0.002336583285964409, "train_loss": 0.6491293019662874, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.024897558375810966, "epoch": 77, "n_parameters": 329209088} +{"train_lr": 0.0023331883092544124, "train_min_lr": 0.0023331883092544124, "train_loss": 0.6490245730592272, "train_loss_scale": 277267.6923076923, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.025258563841000583, "epoch": 78, "n_parameters": 329209088} 
+{"train_lr": 0.0023297074166367046, "train_min_lr": 0.0023297074166367046, "train_loss": 0.6488851320523864, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02498620083460059, "epoch": 79, "n_parameters": 329209088} +{"train_lr": 0.0023261408731947413, "train_min_lr": 0.0023261408731947413, "train_loss": 0.6487186426858012, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.026786060567993004, "epoch": 80, "n_parameters": 329209088} +{"train_lr": 0.002322488950534608, "train_min_lr": 0.002322488950534608, "train_loss": 0.6485617144379574, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.025100841831702452, "epoch": 81, "n_parameters": 329209088} +{"train_lr": 0.0023187519267643627, "train_min_lr": 0.0023187519267643627, "train_loss": 0.6483423545932732, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02578068504898021, "epoch": 82, "n_parameters": 329209088} +{"train_lr": 0.0023149300864728226, "train_min_lr": 0.0023149300864728226, "train_loss": 0.6483991448170482, "train_loss_scale": 321798.5641025641, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 83, "n_parameters": 329209088} +{"train_lr": 0.00231102372070793, "train_min_lr": 0.00231102372070793, "train_loss": 0.6483515744897513, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.026304703420744494, "epoch": 84, "n_parameters": 329209088} +{"train_lr": 0.002307033126954561, "train_min_lr": 0.002307033126954561, "train_loss": 0.6480086933200558, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02522221313885007, "epoch": 85, "n_parameters": 329209088} +{"train_lr": 0.002302958609111882, "train_min_lr": 0.002302958609111882, "train_loss": 0.6478776008320543, "train_loss_scale": 262144.0, "train_weight_decay": 
0.05000000000000026, "train_grad_norm": 0.026340058545032755, "epoch": 86, "n_parameters": 329209088} +{"train_lr": 0.002298800477470194, "train_min_lr": 0.002298800477470194, "train_loss": 0.6476758741475164, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.025405832703631274, "epoch": 87, "n_parameters": 329209088} +{"train_lr": 0.00229455904868733, "train_min_lr": 0.00229455904868733, "train_loss": 0.6476719009522827, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.025368646569311235, "epoch": 88, "n_parameters": 329209088} +{"train_lr": 0.0022902346457645086, "train_min_lr": 0.0022902346457645086, "train_loss": 0.6475359046270546, "train_loss_scale": 357087.1794871795, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.025890456590180595, "epoch": 89, "n_parameters": 329209088} +{"train_lr": 0.0022858275980217526, "train_min_lr": 0.0022858275980217526, "train_loss": 0.6474509262957443, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.027318660677291263, "epoch": 90, "n_parameters": 329209088} +{"train_lr": 0.0022813382410728175, "train_min_lr": 0.0022813382410728175, "train_loss": 0.6472241824361472, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.025515205352208935, "epoch": 91, "n_parameters": 329209088} +{"train_lr": 0.0022767669167996097, "train_min_lr": 0.0022767669167996097, "train_loss": 0.6470353854461931, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.025716939881348457, "epoch": 92, "n_parameters": 329209088} +{"train_lr": 0.0022721139733261745, "train_min_lr": 0.0022721139733261745, "train_loss": 0.6468503166837857, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.025778588217993576, "epoch": 93, "n_parameters": 329209088} +{"train_lr": 0.00226737976499217, 
"train_min_lr": 0.00226737976499217, "train_loss": 0.6468086960152365, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02574768583051478, "epoch": 94, "n_parameters": 329209088} +{"train_lr": 0.0022625646523258902, "train_min_lr": 0.0022625646523258902, "train_loss": 0.6466721153925531, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02605553028675226, "epoch": 95, "n_parameters": 329209088} +{"train_lr": 0.002257669002016808, "train_min_lr": 0.002257669002016808, "train_loss": 0.6466963238751468, "train_loss_scale": 720896.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 96, "n_parameters": 329209088} +{"train_lr": 0.002252693186887647, "train_min_lr": 0.002252693186887647, "train_loss": 0.6463773876476364, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.0254454029401621, "epoch": 97, "n_parameters": 329209088} +{"train_lr": 0.0022476375858659953, "train_min_lr": 0.0022476375858659953, "train_loss": 0.6464182320468796, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02652996536105489, "epoch": 98, "n_parameters": 329209088} +{"train_lr": 0.0022425025839554473, "train_min_lr": 0.0022425025839554473, "train_loss": 0.6462093366918942, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02581791605394429, "epoch": 99, "n_parameters": 329209088} +{"train_lr": 0.002237288572206275, "train_min_lr": 0.002237288572206275, "train_loss": 0.6461507878385675, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02692990412768454, "epoch": 100, "n_parameters": 329209088} +{"train_lr": 0.00223199594768566, "train_min_lr": 0.00223199594768566, "train_loss": 0.6463096185694807, "train_loss_scale": 492360.2051282051, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 
Infinity, "epoch": 101, "n_parameters": 329209088} +{"train_lr": 0.002226625113447457, "train_min_lr": 0.002226625113447457, "train_loss": 0.6458756862900769, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02566970758832609, "epoch": 102, "n_parameters": 329209088} +{"train_lr": 0.0022211764785014763, "train_min_lr": 0.0022211764785014763, "train_loss": 0.6458334285371865, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02635402318973763, "epoch": 103, "n_parameters": 329209088} +{"train_lr": 0.0022156504577823745, "train_min_lr": 0.0022156504577823745, "train_loss": 0.6456220043787303, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.026304775789284553, "epoch": 104, "n_parameters": 329209088} +{"train_lr": 0.0022100474721180197, "train_min_lr": 0.0022100474721180197, "train_loss": 0.6457335386747638, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.026793836389119044, "epoch": 105, "n_parameters": 329209088} +{"train_lr": 0.002204367948197461, "train_min_lr": 0.002204367948197461, "train_loss": 0.6454476715149119, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.025944610627797935, "epoch": 106, "n_parameters": 329209088} +{"train_lr": 0.0021986123185384417, "train_min_lr": 0.0021986123185384417, "train_loss": 0.6453419021235254, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02678964802852044, "epoch": 107, "n_parameters": 329209088} +{"train_lr": 0.0021927810214544354, "train_min_lr": 0.0021927810214544354, "train_loss": 0.6456970633843389, "train_loss_scale": 165940.5128205128, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 108, "n_parameters": 329209088} +{"train_lr": 0.0021868745010212983, "train_min_lr": 0.0021868745010212983, "train_loss": 
0.6452146430428212, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.026297068336787514, "epoch": 109, "n_parameters": 329209088} +{"train_lr": 0.0021808932070434225, "train_min_lr": 0.0021808932070434225, "train_loss": 0.6451982517535679, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.028141063226100344, "epoch": 110, "n_parameters": 329209088} +{"train_lr": 0.0021748375950195096, "train_min_lr": 0.0021748375950195096, "train_loss": 0.6447996499900444, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02630796065577903, "epoch": 111, "n_parameters": 329209088} +{"train_lr": 0.0021687081261078573, "train_min_lr": 0.0021687081261078573, "train_loss": 0.6448169751212192, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.027201349781348538, "epoch": 112, "n_parameters": 329209088} +{"train_lr": 0.0021625052670912527, "train_min_lr": 0.0021625052670912527, "train_loss": 0.6447285132076687, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02671114440696935, "epoch": 113, "n_parameters": 329209088} +{"train_lr": 0.0021562294903414267, "train_min_lr": 0.0021562294903414267, "train_loss": 0.644528079407815, "train_loss_scale": 173502.35897435897, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02700735426818331, "epoch": 114, "n_parameters": 329209088} +{"train_lr": 0.002149881273783077, "train_min_lr": 0.002149881273783077, "train_loss": 0.644735168856688, "train_loss_scale": 209631.1794871795, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 115, "n_parameters": 329209088} +{"train_lr": 0.0021434611008574723, "train_min_lr": 0.0021434611008574723, "train_loss": 0.6443473706141305, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02649587496685294, "epoch": 
116, "n_parameters": 329209088} +{"train_lr": 0.002136969460485639, "train_min_lr": 0.002136969460485639, "train_loss": 0.6443055792448994, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.027255071219629966, "epoch": 117, "n_parameters": 329209088} +{"train_lr": 0.002130406847031118, "train_min_lr": 0.002130406847031118, "train_loss": 0.6440789520394248, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.027710106474562332, "epoch": 118, "n_parameters": 329209088} +{"train_lr": 0.002123773760262341, "train_min_lr": 0.002123773760262341, "train_loss": 0.6441220993008943, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.026872598136273716, "epoch": 119, "n_parameters": 329209088} +{"train_lr": 0.0021170707053145433, "train_min_lr": 0.0021170707053145433, "train_loss": 0.6440309921745211, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.027187776214514788, "epoch": 120, "n_parameters": 329209088} +{"train_lr": 0.0021102981926513073, "train_min_lr": 0.0021102981926513073, "train_loss": 0.6437878189190553, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.026802275407438476, "epoch": 121, "n_parameters": 329209088} +{"train_lr": 0.0021034567380257023, "train_min_lr": 0.0021034567380257023, "train_loss": 0.6438411706640648, "train_loss_scale": 260883.6923076923, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.027098244187445976, "epoch": 122, "n_parameters": 329209088} +{"train_lr": 0.0020965468624409753, "train_min_lr": 0.0020965468624409753, "train_loss": 0.643669240248318, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02758789159811269, "epoch": 123, "n_parameters": 329209088} +{"train_lr": 0.002089569092110911, "train_min_lr": 0.002089569092110911, "train_loss": 0.6440405128284906, 
"train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.031389404189749025, "epoch": 124, "n_parameters": 329209088} +{"train_lr": 0.0020825239584197322, "train_min_lr": 0.0020825239584197322, "train_loss": 0.6435283252623123, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.028030019372892685, "epoch": 125, "n_parameters": 329209088} +{"train_lr": 0.0020754119978816502, "train_min_lr": 0.0020754119978816502, "train_loss": 0.6432989513119444, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.027008474129849136, "epoch": 126, "n_parameters": 329209088} +{"train_lr": 0.0020682337520999913, "train_min_lr": 0.0020682337520999913, "train_loss": 0.6432904241761813, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.028780500828407895, "epoch": 127, "n_parameters": 329209088} +{"train_lr": 0.0020609897677259627, "train_min_lr": 0.0020609897677259627, "train_loss": 0.6433132893393915, "train_loss_scale": 414221.1282051282, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.027734479001269508, "epoch": 128, "n_parameters": 329209088} +{"train_lr": 0.0020536805964170256, "train_min_lr": 0.0020536805964170256, "train_loss": 0.6431535486645328, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02747440369775853, "epoch": 129, "n_parameters": 329209088} +{"train_lr": 0.00204630679479487, "train_min_lr": 0.00204630679479487, "train_loss": 0.6430675516752765, "train_loss_scale": 314236.71794871794, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 130, "n_parameters": 329209088} +{"train_lr": 0.002038868924403038, "train_min_lr": 0.002038868924403038, "train_loss": 0.6429448448336468, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02757032032315739, "epoch": 131, "n_parameters": 
329209088} +{"train_lr": 0.0020313675516641576, "train_min_lr": 0.0020313675516641576, "train_loss": 0.6428246178532927, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02760386790554875, "epoch": 132, "n_parameters": 329209088} +{"train_lr": 0.002023803247836806, "train_min_lr": 0.002023803247836806, "train_loss": 0.6428641294403814, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.028993207758340318, "epoch": 133, "n_parameters": 329209088} +{"train_lr": 0.002016176588972008, "train_min_lr": 0.002016176588972008, "train_loss": 0.6425923044817188, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.027386506995520532, "epoch": 134, "n_parameters": 329209088} +{"train_lr": 0.002008488155869361, "train_min_lr": 0.002008488155869361, "train_loss": 0.6427169597516649, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.030019870750271738, "epoch": 135, "n_parameters": 329209088} +{"train_lr": 0.002000738534032814, "train_min_lr": 0.002000738534032814, "train_loss": 0.6424605526650945, "train_loss_scale": 364649.0256410256, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.027799924089310642, "epoch": 136, "n_parameters": 329209088} +{"train_lr": 0.001992928313626073, "train_min_lr": 0.001992928313626073, "train_loss": 0.6424023582397077, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.028908150801912714, "epoch": 137, "n_parameters": 329209088} +{"train_lr": 0.0019850580894276585, "train_min_lr": 0.0019850580894276585, "train_loss": 0.6429825143482631, "train_loss_scale": 420102.5641025641, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 138, "n_parameters": 329209088} +{"train_lr": 0.001977128460785622, "train_min_lr": 0.001977128460785622, "train_loss": 0.6422227818745737, "train_loss_scale": 262144.0, 
"train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.027390267067135144, "epoch": 139, "n_parameters": 329209088} +{"train_lr": 0.0019691400315718726, "train_min_lr": 0.0019691400315718726, "train_loss": 0.6421666500063088, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02835275857255627, "epoch": 140, "n_parameters": 329209088} +{"train_lr": 0.0019610934101362376, "train_min_lr": 0.0019610934101362376, "train_loss": 0.6419671408306712, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.028144767698951256, "epoch": 141, "n_parameters": 329209088} +{"train_lr": 0.0019529892092600813, "train_min_lr": 0.0019529892092600813, "train_loss": 0.6419400670267164, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02806395894059768, "epoch": 142, "n_parameters": 329209088} +{"train_lr": 0.001944828046109684, "train_min_lr": 0.001944828046109684, "train_loss": 0.6418093704940895, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.028449503251184255, "epoch": 143, "n_parameters": 329209088} +{"train_lr": 0.001936610542189214, "train_min_lr": 0.001936610542189214, "train_loss": 0.6417257706037699, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02862915399078375, "epoch": 144, "n_parameters": 329209088} +{"train_lr": 0.0019283373232934099, "train_min_lr": 0.0019283373232934099, "train_loss": 0.6417567568102803, "train_loss_scale": 339442.8717948718, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 145, "n_parameters": 329209088} +{"train_lr": 0.0019200090194599233, "train_min_lr": 0.0019200090194599233, "train_loss": 0.6414070221213385, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.028315486177467764, "epoch": 146, "n_parameters": 329209088} +{"train_lr": 
0.0019116262649213377, "train_min_lr": 0.0019116262649213377, "train_loss": 0.6413639460224658, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.028422319861606527, "epoch": 147, "n_parameters": 329209088} +{"train_lr": 0.0019031896980568602, "train_min_lr": 0.0019031896980568602, "train_loss": 0.6412632769176688, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.029910444234235164, "epoch": 148, "n_parameters": 329209088} +{"train_lr": 0.001894699961343726, "train_min_lr": 0.001894699961343726, "train_loss": 0.6412548693553664, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02833477193967272, "epoch": 149, "n_parameters": 329209088} +{"train_lr": 0.0018861577013082516, "train_min_lr": 0.0018861577013082516, "train_loss": 0.6411763264152867, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.028758050061953373, "epoch": 150, "n_parameters": 329209088} +{"train_lr": 0.0018775635684766133, "train_min_lr": 0.0018775635684766133, "train_loss": 0.6409683011125964, "train_loss_scale": 336082.0512820513, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02854277726668769, "epoch": 151, "n_parameters": 329209088} +{"train_lr": 0.0018689182173253027, "train_min_lr": 0.0018689182173253027, "train_loss": 0.640888780910665, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02893657242664351, "epoch": 152, "n_parameters": 329209088} +{"train_lr": 0.0018602223062312783, "train_min_lr": 0.0018602223062312783, "train_loss": 0.6408331939138663, "train_loss_scale": 280628.5128205128, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 153, "n_parameters": 329209088} +{"train_lr": 0.0018514764974218371, "train_min_lr": 0.0018514764974218371, "train_loss": 0.640822907223199, "train_loss_scale": 262144.0, 
"train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02887085449690811, "epoch": 154, "n_parameters": 329209088} +{"train_lr": 0.0018426814569241794, "train_min_lr": 0.0018426814569241794, "train_loss": 0.6406777461942954, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.028397786454894602, "epoch": 155, "n_parameters": 329209088} +{"train_lr": 0.0018338378545146976, "train_min_lr": 0.0018338378545146976, "train_loss": 0.6405608079288728, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02968214620024157, "epoch": 156, "n_parameters": 329209088} +{"train_lr": 0.0018249463636679463, "train_min_lr": 0.0018249463636679463, "train_loss": 0.6404920960836208, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02901425060386268, "epoch": 157, "n_parameters": 329209088} +{"train_lr": 0.0018160076615053812, "train_min_lr": 0.0018160076615053812, "train_loss": 0.6403778348810588, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.029200542533101562, "epoch": 158, "n_parameters": 329209088} +{"train_lr": 0.0018070224287437813, "train_min_lr": 0.0018070224287437813, "train_loss": 0.6403555395678641, "train_loss_scale": 398257.23076923075, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02986457129009068, "epoch": 159, "n_parameters": 329209088} +{"train_lr": 0.0017979913496434085, "train_min_lr": 0.0017979913496434085, "train_loss": 0.6401424498404734, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.028987549125957184, "epoch": 160, "n_parameters": 329209088} +{"train_lr": 0.001788915111955901, "train_min_lr": 0.001788915111955901, "train_loss": 0.6400157217688572, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02981394166365648, "epoch": 161, "n_parameters": 329209088} +{"train_lr": 
0.0017797944068718974, "train_min_lr": 0.0017797944068718974, "train_loss": 0.6401064885027993, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.029524828719261747, "epoch": 162, "n_parameters": 329209088} +{"train_lr": 0.0017706299289684047, "train_min_lr": 0.0017706299289684047, "train_loss": 0.6400016286374571, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.031317353391876586, "epoch": 163, "n_parameters": 329209088} +{"train_lr": 0.0017614223761558967, "train_min_lr": 0.0017614223761558967, "train_loss": 0.639819734926837, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.0297823006955859, "epoch": 164, "n_parameters": 329209088} +{"train_lr": 0.001752172449625165, "train_min_lr": 0.001752172449625165, "train_loss": 0.6397659945695733, "train_loss_scale": 581421.9487179487, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.029121420626791242, "epoch": 165, "n_parameters": 329209088} +{"train_lr": 0.0017428808537939323, "train_min_lr": 0.0017428808537939323, "train_loss": 0.6395678837329913, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02940493517029935, "epoch": 166, "n_parameters": 329209088} +{"train_lr": 0.0017335482962531922, "train_min_lr": 0.0017335482962531922, "train_loss": 0.6394719716806251, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.030324584625374813, "epoch": 167, "n_parameters": 329209088} +{"train_lr": 0.0017241754877133318, "train_min_lr": 0.0017241754877133318, "train_loss": 0.6393246623174025, "train_loss_scale": 578061.1282051282, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 168, "n_parameters": 329209088} +{"train_lr": 0.0017147631419500143, "train_min_lr": 0.0017147631419500143, "train_loss": 0.6394077892271945, "train_loss_scale": 524288.0, 
"train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03154762937591817, "epoch": 169, "n_parameters": 329209088} +{"train_lr": 0.0017053119757498118, "train_min_lr": 0.0017053119757498118, "train_loss": 0.6392433139650772, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.02979543413489293, "epoch": 170, "n_parameters": 329209088} +{"train_lr": 0.001695822708855617, "train_min_lr": 0.001695822708855617, "train_loss": 0.6391462445593415, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.031601412108359046, "epoch": 171, "n_parameters": 329209088} +{"train_lr": 0.001686296063911845, "train_min_lr": 0.001686296063911845, "train_loss": 0.6389658685940771, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03043923623716602, "epoch": 172, "n_parameters": 329209088} +{"train_lr": 0.0016767327664093945, "train_min_lr": 0.0016767327664093945, "train_loss": 0.638979444357877, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03034274318876366, "epoch": 173, "n_parameters": 329209088} +{"train_lr": 0.0016671335446303921, "train_min_lr": 0.0016671335446303921, "train_loss": 0.6388872074476706, "train_loss_scale": 779710.358974359, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03126039603151954, "epoch": 174, "n_parameters": 329209088} +{"train_lr": 0.0016574991295927436, "train_min_lr": 0.0016574991295927436, "train_loss": 0.6387193798720359, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.030189521562976714, "epoch": 175, "n_parameters": 329209088} +{"train_lr": 0.001647830254994458, "train_min_lr": 0.001647830254994458, "train_loss": 0.6386389592363952, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.030193925399381, "epoch": 176, "n_parameters": 329209088} +{"train_lr": 
0.0016381276571577643, "train_min_lr": 0.0016381276571577643, "train_loss": 0.6385088559263983, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.030736114835748687, "epoch": 177, "n_parameters": 329209088} +{"train_lr": 0.0016283920749730564, "train_min_lr": 0.0016283920749730564, "train_loss": 0.6391069796593047, "train_loss_scale": 138213.7435897436, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 178, "n_parameters": 329209088} +{"train_lr": 0.0016186242498426112, "train_min_lr": 0.0016186242498426112, "train_loss": 0.6384479778651626, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.030623054800507348, "epoch": 179, "n_parameters": 329209088} +{"train_lr": 0.0016088249256241284, "train_min_lr": 0.0016088249256241284, "train_loss": 0.6382542708172247, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03130309429401771, "epoch": 180, "n_parameters": 329209088} +{"train_lr": 0.0015989948485740878, "train_min_lr": 0.0015989948485740878, "train_loss": 0.6381903741604242, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.030516292607316222, "epoch": 181, "n_parameters": 329209088} +{"train_lr": 0.0015891347672909151, "train_min_lr": 0.0015891347672909151, "train_loss": 0.6380550607752341, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.030672436036790412, "epoch": 182, "n_parameters": 329209088} +{"train_lr": 0.0015792454326579762, "train_min_lr": 0.0015792454326579762, "train_loss": 0.6379414519104056, "train_loss_scale": 131072.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.030873728098156743, "epoch": 183, "n_parameters": 329209088} +{"train_lr": 0.00156932759778639, "train_min_lr": 0.00156932759778639, "train_loss": 0.637759269018156, "train_loss_scale": 206270.35897435897, 
"train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03118793157120355, "epoch": 184, "n_parameters": 329209088} +{"train_lr": 0.00155938201795768, "train_min_lr": 0.00155938201795768, "train_loss": 0.6378241749540067, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.031131858113580026, "epoch": 185, "n_parameters": 329209088} +{"train_lr": 0.0015494094505662558, "train_min_lr": 0.0015494094505662558, "train_loss": 0.6378064515033307, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03346967529983093, "epoch": 186, "n_parameters": 329209088} +{"train_lr": 0.001539410655061736, "train_min_lr": 0.001539410655061736, "train_loss": 0.6375938148166125, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.031042226530515995, "epoch": 187, "n_parameters": 329209088} +{"train_lr": 0.0015293863928911096, "train_min_lr": 0.0015293863928911096, "train_loss": 0.637503370248641, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03165703466854607, "epoch": 188, "n_parameters": 329209088} +{"train_lr": 0.0015193374274407522, "train_min_lr": 0.0015193374274407522, "train_loss": 0.6373424760651034, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.032703791452476226, "epoch": 189, "n_parameters": 329209088} +{"train_lr": 0.00150926452397829, "train_min_lr": 0.00150926452397829, "train_loss": 0.6372924632189844, "train_loss_scale": 304994.46153846156, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03171496963104568, "epoch": 190, "n_parameters": 329209088} +{"train_lr": 0.0014991684495943168, "train_min_lr": 0.0014991684495943168, "train_loss": 0.6371971159839095, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03190691110033255, "epoch": 191, "n_parameters": 329209088} +{"train_lr": 
0.0014890499731439859, "train_min_lr": 0.0014890499731439859, "train_loss": 0.6370357151728314, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03221662750897499, "epoch": 192, "n_parameters": 329209088} +{"train_lr": 0.0014789098651884587, "train_min_lr": 0.0014789098651884587, "train_loss": 0.6369895840732333, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03217489603691949, "epoch": 193, "n_parameters": 329209088} +{"train_lr": 0.0014687488979362115, "train_min_lr": 0.0014687488979362115, "train_loss": 0.6369786758023577, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.033310710631597504, "epoch": 194, "n_parameters": 329209088} +{"train_lr": 0.0014585678451842408, "train_min_lr": 0.0014585678451842408, "train_loss": 0.6367120626143729, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03195363310022423, "epoch": 195, "n_parameters": 329209088} +{"train_lr": 0.001448367482259133, "train_min_lr": 0.001448367482259133, "train_loss": 0.6366518784362154, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03254236969858026, "epoch": 196, "n_parameters": 329209088} +{"train_lr": 0.001438148585958014, "train_min_lr": 0.001438148585958014, "train_loss": 0.6365046131078345, "train_loss_scale": 919184.4102564103, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.032628384771016546, "epoch": 197, "n_parameters": 329209088} +{"train_lr": 0.0014279119344894028, "train_min_lr": 0.0014279119344894028, "train_loss": 0.6365064023396908, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.032482835076128445, "epoch": 198, "n_parameters": 329209088} +{"train_lr": 0.001417658307413943, "train_min_lr": 0.001417658307413943, "train_loss": 0.6363673461839939, "train_loss_scale": 1048576.0, 
"train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03294227854348719, "epoch": 199, "n_parameters": 329209088} +{"train_lr": 0.0014073884855850317, "train_min_lr": 0.0014073884855850317, "train_loss": 0.6362196238448795, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.032725461066151276, "epoch": 200, "n_parameters": 329209088} +{"train_lr": 0.0013971032510893652, "train_min_lr": 0.0013971032510893652, "train_loss": 0.6361585415016191, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.0329327087264317, "epoch": 201, "n_parameters": 329209088} +{"train_lr": 0.0013868033871873699, "train_min_lr": 0.0013868033871873699, "train_loss": 0.6360381598441074, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.032554248007587515, "epoch": 202, "n_parameters": 329209088} +{"train_lr": 0.0013764896782535608, "train_min_lr": 0.0013764896782535608, "train_loss": 0.6358811189360821, "train_loss_scale": 1408183.7948717948, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03306842551757701, "epoch": 203, "n_parameters": 329209088} +{"train_lr": 0.0013661629097168016, "train_min_lr": 0.0013661629097168016, "train_loss": 0.6357421993683928, "train_loss_scale": 836844.3076923077, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 204, "n_parameters": 329209088} +{"train_lr": 0.0013558238680005013, "train_min_lr": 0.0013558238680005013, "train_loss": 0.6357626439705013, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03339464004294804, "epoch": 205, "n_parameters": 329209088} +{"train_lr": 0.001345473340462714, "train_min_lr": 0.001345473340462714, "train_loss": 0.6355263965192418, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03334607285423539, "epoch": 206, "n_parameters": 329209088} +{"train_lr": 
0.0013351121153361868, "train_min_lr": 0.0013351121153361868, "train_loss": 0.6355979875183831, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.033490136988126695, "epoch": 207, "n_parameters": 329209088} +{"train_lr": 0.0013247409816683248, "train_min_lr": 0.0013247409816683248, "train_loss": 0.6353928542946681, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.033524584132604875, "epoch": 208, "n_parameters": 329209088} +{"train_lr": 0.001314360729261115, "train_min_lr": 0.001314360729261115, "train_loss": 0.6353181417231472, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03348290323255918, "epoch": 209, "n_parameters": 329209088} +{"train_lr": 0.0013039721486109638, "train_min_lr": 0.0013039721486109638, "train_loss": 0.6351780112510403, "train_loss_scale": 725937.2307692308, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03518244367427169, "epoch": 210, "n_parameters": 329209088} +{"train_lr": 0.0012935760308485087, "train_min_lr": 0.0012935760308485087, "train_loss": 0.6349511720622197, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03371304626433322, "epoch": 211, "n_parameters": 329209088} +{"train_lr": 0.001283173167678369, "train_min_lr": 0.001283173167678369, "train_loss": 0.6349017564087915, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03395078774804297, "epoch": 212, "n_parameters": 329209088} +{"train_lr": 0.0012727643513188527, "train_min_lr": 0.0012727643513188527, "train_loss": 0.6347861846443266, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03394038671961962, "epoch": 213, "n_parameters": 329209088} +{"train_lr": 0.0012623503744416211, "train_min_lr": 0.0012623503744416211, "train_loss": 0.6347843402017577, "train_loss_scale": 1048576.0, 
"train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03449265233193261, "epoch": 214, "n_parameters": 329209088} +{"train_lr": 0.0012519320301113358, "train_min_lr": 0.0012519320301113358, "train_loss": 0.6345033211132082, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03418060139012642, "epoch": 215, "n_parameters": 329209088} +{"train_lr": 0.001241510111725253, "train_min_lr": 0.001241510111725253, "train_loss": 0.6344598844850388, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03476034481532107, "epoch": 216, "n_parameters": 329209088} +{"train_lr": 0.0012310854129528052, "train_min_lr": 0.0012310854129528052, "train_loss": 0.634398295944079, "train_loss_scale": 1377936.4102564103, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 217, "n_parameters": 329209088} +{"train_lr": 0.0012206587276751707, "train_min_lr": 0.0012206587276751707, "train_loss": 0.6343038002303683, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03473580542665262, "epoch": 218, "n_parameters": 329209088} +{"train_lr": 0.0012102308499247973, "train_min_lr": 0.0012102308499247973, "train_loss": 0.6341536501124023, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03448977081988675, "epoch": 219, "n_parameters": 329209088} +{"train_lr": 0.0011998025738249494, "train_min_lr": 0.0011998025738249494, "train_loss": 0.633975476725027, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03488687019293698, "epoch": 220, "n_parameters": 329209088} +{"train_lr": 0.0011893746935292267, "train_min_lr": 0.0011893746935292267, "train_loss": 0.6341097245577914, "train_loss_scale": 572179.6923076923, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 221, "n_parameters": 329209088} +{"train_lr": 
0.0011789480031610881, "train_min_lr": 0.0011789480031610881, "train_loss": 0.6338705676189886, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03692805854818569, "epoch": 222, "n_parameters": 329209088} +{"train_lr": 0.0011685232967533748, "train_min_lr": 0.0011685232967533748, "train_loss": 0.6337018027567328, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03534728240293379, "epoch": 223, "n_parameters": 329209088} +{"train_lr": 0.0011581013681878376, "train_min_lr": 0.0011581013681878376, "train_loss": 0.6336842688421408, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03566962817253975, "epoch": 224, "n_parameters": 329209088} +{"train_lr": 0.0011476830111346887, "train_min_lr": 0.0011476830111346887, "train_loss": 0.6334512306329532, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03549365215719892, "epoch": 225, "n_parameters": 329209088} +{"train_lr": 0.0011372690189921531, "train_min_lr": 0.0011372690189921531, "train_loss": 0.6333092775446578, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.035773792053357914, "epoch": 226, "n_parameters": 329209088} +{"train_lr": 0.0011268601848260535, "train_min_lr": 0.0011268601848260535, "train_loss": 0.633405420368012, "train_loss_scale": 311716.10256410256, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.037723404104606464, "epoch": 227, "n_parameters": 329209088} +{"train_lr": 0.0011164573013094075, "train_min_lr": 0.0011164573013094075, "train_loss": 0.6331540410144206, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03605122995586731, "epoch": 228, "n_parameters": 329209088} +{"train_lr": 0.001106061160662077, "train_min_lr": 0.001106061160662077, "train_loss": 0.6329986882419922, "train_loss_scale": 524288.0, 
"train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.035730649723122135, "epoch": 229, "n_parameters": 329209088} +{"train_lr": 0.0010956725545904168, "train_min_lr": 0.0010956725545904168, "train_loss": 0.6328313132592788, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03610342946978143, "epoch": 230, "n_parameters": 329209088} +{"train_lr": 0.0010852922742270053, "train_min_lr": 0.0010852922742270053, "train_loss": 0.6328896881761745, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03644952149345325, "epoch": 231, "n_parameters": 329209088} +{"train_lr": 0.0010749211100703794, "train_min_lr": 0.0010749211100703794, "train_loss": 0.6327227253872806, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.036498260899231985, "epoch": 232, "n_parameters": 329209088} +{"train_lr": 0.0010645598519248383, "train_min_lr": 0.0010645598519248383, "train_loss": 0.6326258556248668, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03659982502293319, "epoch": 233, "n_parameters": 329209088} +{"train_lr": 0.0010542092888403117, "train_min_lr": 0.0010542092888403117, "train_loss": 0.6326706966934487, "train_loss_scale": 676365.1282051282, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 234, "n_parameters": 329209088} +{"train_lr": 0.0010438702090522496, "train_min_lr": 0.0010438702090522496, "train_loss": 0.6324654672927676, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03658811449526976, "epoch": 235, "n_parameters": 329209088} +{"train_lr": 0.001033543399921608, "train_min_lr": 0.001033543399921608, "train_loss": 0.6322615499763439, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03713475795200047, "epoch": 236, "n_parameters": 329209088} +{"train_lr": 
0.001023229647874884, "train_min_lr": 0.001023229647874884, "train_loss": 0.6321005097960528, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03762158864321044, "epoch": 237, "n_parameters": 329209088} +{"train_lr": 0.0010129297383442272, "train_min_lr": 0.0010129297383442272, "train_loss": 0.6319690402358388, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03752801210309068, "epoch": 238, "n_parameters": 329209088} +{"train_lr": 0.0010026444557076238, "train_min_lr": 0.0010026444557076238, "train_loss": 0.6319760522888734, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03825479864071195, "epoch": 239, "n_parameters": 329209088} +{"train_lr": 0.000992374583229171, "train_min_lr": 0.000992374583229171, "train_loss": 0.6317316137779599, "train_loss_scale": 262144.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.037378734592587136, "epoch": 240, "n_parameters": 329209088} +{"train_lr": 0.0009821209029994167, "train_min_lr": 0.0009821209029994167, "train_loss": 0.6315415556339595, "train_loss_scale": 501602.46153846156, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03766573099897076, "epoch": 241, "n_parameters": 329209088} +{"train_lr": 0.0009718841958758109, "train_min_lr": 0.0009718841958758109, "train_loss": 0.6314926285368319, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.037886448514958225, "epoch": 242, "n_parameters": 329209088} +{"train_lr": 0.0009616652414232358, "train_min_lr": 0.0009616652414232358, "train_loss": 0.6313681731251284, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.04038268268012848, "epoch": 243, "n_parameters": 329209088} +{"train_lr": 0.0009514648178546331, "train_min_lr": 0.0009514648178546331, "train_loss": 0.63123246973667, "train_loss_scale": 524288.0, 
"train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.038252633124685444, "epoch": 244, "n_parameters": 329209088} +{"train_lr": 0.0009412837019717529, "train_min_lr": 0.0009412837019717529, "train_loss": 0.6311447082194858, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03832736581516189, "epoch": 245, "n_parameters": 329209088} +{"train_lr": 0.0009311226691059865, "train_min_lr": 0.0009311226691059865, "train_loss": 0.6310885321790687, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03854645136743784, "epoch": 246, "n_parameters": 329209088} +{"train_lr": 0.0009209824930593261, "train_min_lr": 0.0009209824930593261, "train_loss": 0.6309239633548527, "train_loss_scale": 788112.4102564103, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.038729565659872234, "epoch": 247, "n_parameters": 329209088} +{"train_lr": 0.0009108639460454382, "train_min_lr": 0.0009108639460454382, "train_loss": 0.6307369192035344, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.038624203662411906, "epoch": 248, "n_parameters": 329209088} +{"train_lr": 0.0009007677986308538, "train_min_lr": 0.0009007677986308538, "train_loss": 0.6306912183570557, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03909732751213969, "epoch": 249, "n_parameters": 329209088} +{"train_lr": 0.0008906948196762859, "train_min_lr": 0.0008906948196762859, "train_loss": 0.6305867205493343, "train_loss_scale": 767947.4871794871, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 250, "n_parameters": 329209088} +{"train_lr": 0.000880645776278082, "train_min_lr": 0.000880645776278082, "train_loss": 0.6305206664419996, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.03939954353830753, "epoch": 251, "n_parameters": 329209088} +{"train_lr": 
0.000870621433709802, "train_min_lr": 0.000870621433709802, "train_loss": 0.6303684700769969, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.039426080082567073, "epoch": 252, "n_parameters": 329209088} +{"train_lr": 0.0008606225553639454, "train_min_lr": 0.0008606225553639454, "train_loss": 0.6300959098212516, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.039489052150971614, "epoch": 253, "n_parameters": 329209088} +{"train_lr": 0.0008506499026938082, "train_min_lr": 0.0008506499026938082, "train_loss": 0.6300412956446123, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.040096134365273595, "epoch": 254, "n_parameters": 329209088} +{"train_lr": 0.0008407042351555041, "train_min_lr": 0.0008407042351555041, "train_loss": 0.6299278211947053, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.040457814442328155, "epoch": 255, "n_parameters": 329209088} +{"train_lr": 0.0008307863101501201, "train_min_lr": 0.0008307863101501201, "train_loss": 0.629832218048903, "train_loss_scale": 589824.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.04043925850867079, "epoch": 256, "n_parameters": 329209088} +{"train_lr": 0.0008208968829660467, "train_min_lr": 0.0008208968829660467, "train_loss": 0.629643039473404, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.04047447609571883, "epoch": 257, "n_parameters": 329209088} +{"train_lr": 0.0008110367067214505, "train_min_lr": 0.0008110367067214505, "train_loss": 0.6294997143559158, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.04085779420506114, "epoch": 258, "n_parameters": 329209088} +{"train_lr": 0.0008012065323069283, "train_min_lr": 0.0008012065323069283, "train_loss": 0.6294634166555718, "train_loss_scale": 793153.641025641, 
"train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 259, "n_parameters": 329209088} +{"train_lr": 0.0007914071083283216, "train_min_lr": 0.0007914071083283216, "train_loss": 0.6292331363372983, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.041266080809948154, "epoch": 260, "n_parameters": 329209088} +{"train_lr": 0.0007816391810497043, "train_min_lr": 0.0007816391810497043, "train_loss": 0.6290712053827846, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.040999528557921834, "epoch": 261, "n_parameters": 329209088} +{"train_lr": 0.0007719034943365599, "train_min_lr": 0.0007719034943365599, "train_loss": 0.6290712010963128, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.04137831624859992, "epoch": 262, "n_parameters": 329209088} +{"train_lr": 0.0007622007895991216, "train_min_lr": 0.0007622007895991216, "train_loss": 0.6289421918933303, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.04181592774768479, "epoch": 263, "n_parameters": 329209088} +{"train_lr": 0.0007525318057359234, "train_min_lr": 0.0007525318057359234, "train_loss": 0.6286877748460915, "train_loss_scale": 524288.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.042315053031182825, "epoch": 264, "n_parameters": 329209088} +{"train_lr": 0.0007428972790775184, "train_min_lr": 0.0007428972790775184, "train_loss": 0.6285432770382613, "train_loss_scale": 564617.8461538461, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.041562357905487984, "epoch": 265, "n_parameters": 329209088} +{"train_lr": 0.0007332979433304175, "train_min_lr": 0.0007332979433304175, "train_loss": 0.6284733923175969, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.042345505118226774, "epoch": 266, "n_parameters": 329209088} +{"train_lr": 
0.0007237345295211991, "train_min_lr": 0.0007237345295211991, "train_loss": 0.6282531876260271, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.04248199032810636, "epoch": 267, "n_parameters": 329209088} +{"train_lr": 0.0007142077659408526, "train_min_lr": 0.0007142077659408526, "train_loss": 0.6282466627979795, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.04271088121458888, "epoch": 268, "n_parameters": 329209088} +{"train_lr": 0.0007047183780893101, "train_min_lr": 0.0007047183780893101, "train_loss": 0.6280665331078359, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.04285730222741572, "epoch": 269, "n_parameters": 329209088} +{"train_lr": 0.0006952670886201941, "train_min_lr": 0.0006952670886201941, "train_loss": 0.6278973401118165, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.04347971167701941, "epoch": 270, "n_parameters": 329209088} +{"train_lr": 0.0006858546172857918, "train_min_lr": 0.0006858546172857918, "train_loss": 0.6276867646926966, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.043224812276327074, "epoch": 271, "n_parameters": 329209088} +{"train_lr": 0.0006764816808822354, "train_min_lr": 0.0006764816808822354, "train_loss": 0.6276818952308252, "train_loss_scale": 1747626.6666666667, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.04360416929165904, "epoch": 272, "n_parameters": 329209088} +{"train_lr": 0.0006671489931949225, "train_min_lr": 0.0006671489931949225, "train_loss": 0.6275735991254735, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.043848731280423894, "epoch": 273, "n_parameters": 329209088} +{"train_lr": 0.000657857264944153, "train_min_lr": 0.000657857264944153, "train_loss": 0.6273718408834285, "train_loss_scale": 2097152.0, 
"train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.0437518492138061, "epoch": 274, "n_parameters": 329209088} +{"train_lr": 0.0006486072037310056, "train_min_lr": 0.0006486072037310056, "train_loss": 0.6271319916018118, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.04413281639631933, "epoch": 275, "n_parameters": 329209088} +{"train_lr": 0.0006393995139834574, "train_min_lr": 0.0006393995139834574, "train_loss": 0.6272063511364067, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.04504153312733158, "epoch": 276, "n_parameters": 329209088} +{"train_lr": 0.0006302348969027305, "train_min_lr": 0.0006302348969027305, "train_loss": 0.6269641043857123, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.04458890359800977, "epoch": 277, "n_parameters": 329209088} +{"train_lr": 0.0006211140504098989, "train_min_lr": 0.0006211140504098989, "train_loss": 0.6268549130166857, "train_loss_scale": 2634883.282051282, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.04499488609293715, "epoch": 278, "n_parameters": 329209088} +{"train_lr": 0.0006120376690927338, "train_min_lr": 0.0006120376690927338, "train_loss": 0.6266899533056391, "train_loss_scale": 4194304.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.04436291022918736, "epoch": 279, "n_parameters": 329209088} +{"train_lr": 0.0006030064441528149, "train_min_lr": 0.0006030064441528149, "train_loss": 0.6265607187166237, "train_loss_scale": 3475088.4102564105, "train_weight_decay": 0.05000000000000026, "train_grad_norm": NaN, "epoch": 280, "n_parameters": 329209088} +{"train_lr": 0.0005940210633528858, "train_min_lr": 0.0005940210633528858, "train_loss": 0.6264487812176156, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.045277594964807995, "epoch": 281, "n_parameters": 329209088} +{"train_lr": 
0.0005850822109644842, "train_min_lr": 0.0005850822109644842, "train_loss": 0.6262761797839537, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.04600498614570078, "epoch": 282, "n_parameters": 329209088} +{"train_lr": 0.0005761905677158267, "train_min_lr": 0.0005761905677158267, "train_loss": 0.6261774396034292, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.045497196445910215, "epoch": 283, "n_parameters": 329209088} +{"train_lr": 0.0005673468107399736, "train_min_lr": 0.0005673468107399736, "train_loss": 0.625907331937924, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.0460289020736057, "epoch": 284, "n_parameters": 329209088} +{"train_lr": 0.0005585516135232554, "train_min_lr": 0.0005585516135232554, "train_loss": 0.625940507862908, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.04634552927783284, "epoch": 285, "n_parameters": 329209088} +{"train_lr": 0.0005498056458539953, "train_min_lr": 0.0005498056458539953, "train_loss": 0.625758196412323, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.0464239325792266, "epoch": 286, "n_parameters": 329209088} +{"train_lr": 0.000541109573771491, "train_min_lr": 0.000541109573771491, "train_loss": 0.6255990909830405, "train_loss_scale": 3065068.3076923075, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 287, "n_parameters": 329209088} +{"train_lr": 0.0005324640595153003, "train_min_lr": 0.0005324640595153003, "train_loss": 0.6255322515976448, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.04726804811985065, "epoch": 288, "n_parameters": 329209088} +{"train_lr": 0.0005238697614748063, "train_min_lr": 0.0005238697614748063, "train_loss": 0.6253679469406891, "train_loss_scale": 2097152.0, "train_weight_decay": 
0.05000000000000026, "train_grad_norm": 0.04715808993205428, "epoch": 289, "n_parameters": 329209088} +{"train_lr": 0.0005153273341390795, "train_min_lr": 0.0005153273341390795, "train_loss": 0.6250885888361014, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.047379129578192264, "epoch": 290, "n_parameters": 329209088} +{"train_lr": 0.0005068374280470331, "train_min_lr": 0.0005068374280470331, "train_loss": 0.6250022890117879, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.04771568857205029, "epoch": 291, "n_parameters": 329209088} +{"train_lr": 0.0004984006897378886, "train_min_lr": 0.0004984006897378886, "train_loss": 0.6249012614313799, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.0476855332007966, "epoch": 292, "n_parameters": 329209088} +{"train_lr": 0.0004900177617019308, "train_min_lr": 0.0004900177617019308, "train_loss": 0.6247432591059269, "train_loss_scale": 2224863.1794871795, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.04815057661527625, "epoch": 293, "n_parameters": 329209088} +{"train_lr": 0.00048168928233158535, "train_min_lr": 0.00048168928233158535, "train_loss": 0.6245633363962556, "train_loss_scale": 3347377.230769231, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 294, "n_parameters": 329209088} +{"train_lr": 0.0004734158858728016, "train_min_lr": 0.0004734158858728016, "train_loss": 0.624455152085433, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.048516754956486136, "epoch": 295, "n_parameters": 329209088} +{"train_lr": 0.00046519820237675105, "train_min_lr": 0.00046519820237675105, "train_loss": 0.6243504222112302, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.04871210630218952, "epoch": 296, "n_parameters": 329209088} +{"train_lr": 
0.0004570368576518498, "train_min_lr": 0.0004570368576518498, "train_loss": 0.6241685947797333, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.049167035069937505, "epoch": 297, "n_parameters": 329209088} +{"train_lr": 0.00044893247321609476, "train_min_lr": 0.00044893247321609476, "train_loss": 0.62405608387855, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.04924473509144706, "epoch": 298, "n_parameters": 329209088} +{"train_lr": 0.0004408856662497389, "train_min_lr": 0.0004408856662497389, "train_loss": 0.6238690452674069, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.04995741456364974, "epoch": 299, "n_parameters": 329209088} +{"train_lr": 0.00043289704954828676, "train_min_lr": 0.00043289704954828676, "train_loss": 0.623728606205147, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.04916721060633277, "epoch": 300, "n_parameters": 329209088} +{"train_lr": 0.0004249672314758303, "train_min_lr": 0.0004249672314758303, "train_loss": 0.6235619415529072, "train_loss_scale": 4180860.717948718, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.049929130559739396, "epoch": 301, "n_parameters": 329209088} +{"train_lr": 0.0004170968159187159, "train_min_lr": 0.0004170968159187159, "train_loss": 0.6234131246590271, "train_loss_scale": 2386182.564102564, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 302, "n_parameters": 329209088} +{"train_lr": 0.0004092864022395612, "train_min_lr": 0.0004092864022395612, "train_loss": 0.6233146814689136, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05043820763388888, "epoch": 303, "n_parameters": 329209088} +{"train_lr": 0.00040153658523160577, "train_min_lr": 0.00040153658523160577, "train_loss": 0.6232048119179522, "train_loss_scale": 2097152.0, 
"train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.050879253110346884, "epoch": 304, "n_parameters": 329209088} +{"train_lr": 0.00039384795507342054, "train_min_lr": 0.00039384795507342054, "train_loss": 0.6230469430462481, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.050744131279106326, "epoch": 305, "n_parameters": 329209088} +{"train_lr": 0.0003862210972839592, "train_min_lr": 0.0003862210972839592, "train_loss": 0.6228423380675033, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05175772600640089, "epoch": 306, "n_parameters": 329209088} +{"train_lr": 0.00037865659267797083, "train_min_lr": 0.00037865659267797083, "train_loss": 0.622784810128789, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05081591845896, "epoch": 307, "n_parameters": 329209088} +{"train_lr": 0.00037115501732176904, "train_min_lr": 0.00037115501732176904, "train_loss": 0.6225659077366194, "train_loss_scale": 3044903.3846153845, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.051942331620898, "epoch": 308, "n_parameters": 329209088} +{"train_lr": 0.00036371694248936003, "train_min_lr": 0.00036371694248936003, "train_loss": 0.6224814775961045, "train_loss_scale": 2762594.4615384615, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 309, "n_parameters": 329209088} +{"train_lr": 0.00035634293461894045, "train_min_lr": 0.00035634293461894045, "train_loss": 0.6223886375351307, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05241642351477192, "epoch": 310, "n_parameters": 329209088} +{"train_lr": 0.00034903355526975867, "train_min_lr": 0.00034903355526975867, "train_loss": 0.6221648496396553, "train_loss_scale": 1636719.5897435897, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 311, "n_parameters": 329209088} 
+{"train_lr": 0.00034178936107935213, "train_min_lr": 0.00034178936107935213, "train_loss": 0.6220305213310684, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05290654545220045, "epoch": 312, "n_parameters": 329209088} +{"train_lr": 0.00033461090372115536, "train_min_lr": 0.00033461090372115536, "train_loss": 0.6218693003368875, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05271053246150796, "epoch": 313, "n_parameters": 329209088} +{"train_lr": 0.0003274987298624888, "train_min_lr": 0.0003274987298624888, "train_loss": 0.6217108014493417, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.0534327949086825, "epoch": 314, "n_parameters": 329209088} +{"train_lr": 0.0003204533811229274, "train_min_lr": 0.0003204533811229274, "train_loss": 0.6215678360253477, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05305291585719738, "epoch": 315, "n_parameters": 329209088} +{"train_lr": 0.0003134753940330548, "train_min_lr": 0.0003134753940330548, "train_loss": 0.6214922519209676, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.053655578062320367, "epoch": 316, "n_parameters": 329209088} +{"train_lr": 0.00030656529999360446, "train_min_lr": 0.00030656529999360446, "train_loss": 0.6213508629335616, "train_loss_scale": 1078823.3846153845, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.054097659778423034, "epoch": 317, "n_parameters": 329209088} +{"train_lr": 0.0002997236252349912, "train_min_lr": 0.0002997236252349912, "train_loss": 0.6212262413285387, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05381574500829746, "epoch": 318, "n_parameters": 329209088} +{"train_lr": 0.00029295089077723625, "train_min_lr": 0.00029295089077723625, "train_loss": 0.6209992122872231, 
"train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05416175249056556, "epoch": 319, "n_parameters": 329209088} +{"train_lr": 0.00028624761239028984, "train_min_lr": 0.00028624761239028984, "train_loss": 0.620830089540388, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.055238924717578367, "epoch": 320, "n_parameters": 329209088} +{"train_lr": 0.0002796143005547551, "train_min_lr": 0.0002796143005547551, "train_loss": 0.6207279697096405, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05459649208933115, "epoch": 321, "n_parameters": 329209088} +{"train_lr": 0.00027305146042300914, "train_min_lr": 0.00027305146042300914, "train_loss": 0.6206980775683545, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05529561992257069, "epoch": 322, "n_parameters": 329209088} +{"train_lr": 0.0002665595917807374, "train_min_lr": 0.0002665595917807374, "train_loss": 0.6204912179113867, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05496582534546272, "epoch": 323, "n_parameters": 329209088} +{"train_lr": 0.00026013918900887165, "train_min_lr": 0.00026013918900887165, "train_loss": 0.6202278277084518, "train_loss_scale": 3394428.717948718, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.054788462817668915, "epoch": 324, "n_parameters": 329209088} +{"train_lr": 0.00025379074104594005, "train_min_lr": 0.00025379074104594005, "train_loss": 0.6201493390000019, "train_loss_scale": 4194304.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05601183489824717, "epoch": 325, "n_parameters": 329209088} +{"train_lr": 0.00024751473135083417, "train_min_lr": 0.00024751473135083417, "train_loss": 0.620142216125551, "train_loss_scale": 4194304.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05611464600914564, "epoch": 
326, "n_parameters": 329209088} +{"train_lr": 0.00024131163786599073, "train_min_lr": 0.00024131163786599073, "train_loss": 0.6198461944213471, "train_loss_scale": 3044903.3846153845, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 327, "n_parameters": 329209088} +{"train_lr": 0.0002351819329809949, "train_min_lr": 0.0002351819329809949, "train_loss": 0.6197900402311904, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05608035231200166, "epoch": 328, "n_parameters": 329209088} +{"train_lr": 0.00022912608349660654, "train_min_lr": 0.00022912608349660654, "train_loss": 0.6196882854908322, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05585487320637091, "epoch": 329, "n_parameters": 329209088} +{"train_lr": 0.0002231445505892088, "train_min_lr": 0.0002231445505892088, "train_loss": 0.6195478991378481, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05689439606160308, "epoch": 330, "n_parameters": 329209088} +{"train_lr": 0.00021723778977569177, "train_min_lr": 0.00021723778977569177, "train_loss": 0.6194377177729248, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.057099204223889574, "epoch": 331, "n_parameters": 329209088} +{"train_lr": 0.00021140625087876029, "train_min_lr": 0.00021140625087876029, "train_loss": 0.6191904116780139, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.057785087277014285, "epoch": 332, "n_parameters": 329209088} +{"train_lr": 0.0002056503779926791, "train_min_lr": 0.0002056503779926791, "train_loss": 0.6190972226158453, "train_loss_scale": 2386182.564102564, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.056569273225389995, "epoch": 333, "n_parameters": 329209088} +{"train_lr": 0.00019997060944945298, "train_min_lr": 0.00019997060944945298, "train_loss": 
0.6189060840600481, "train_loss_scale": 4194304.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05774792314817508, "epoch": 334, "n_parameters": 329209088} +{"train_lr": 0.00019436737778544695, "train_min_lr": 0.00019436737778544695, "train_loss": 0.6188764330895188, "train_loss_scale": 4194304.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05788267756071992, "epoch": 335, "n_parameters": 329209088} +{"train_lr": 0.00018884110970844586, "train_min_lr": 0.00018884110970844586, "train_loss": 0.6188582589384168, "train_loss_scale": 4194304.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05812949151135026, "epoch": 336, "n_parameters": 329209088} +{"train_lr": 0.00018339222606515945, "train_min_lr": 0.00018339222606515945, "train_loss": 0.6185744639175633, "train_loss_scale": 2312244.512820513, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 337, "n_parameters": 329209088} +{"train_lr": 0.0001780211418091735, "train_min_lr": 0.0001780211418091735, "train_loss": 0.61838981260856, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05827353232038709, "epoch": 338, "n_parameters": 329209088} +{"train_lr": 0.0001727282659693489, "train_min_lr": 0.0001727282659693489, "train_loss": 0.6183419898331451, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05899448341761644, "epoch": 339, "n_parameters": 329209088} +{"train_lr": 0.00016751400161867363, "train_min_lr": 0.00016751400161867363, "train_loss": 0.6181415190848594, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.058398466473684095, "epoch": 340, "n_parameters": 329209088} +{"train_lr": 0.00016237874584356534, "train_min_lr": 0.00016237874584356534, "train_loss": 0.6180627919996206, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.059225902104606994, 
"epoch": 341, "n_parameters": 329209088} +{"train_lr": 0.00015732288971363336, "train_min_lr": 0.00015732288971363336, "train_loss": 0.6179356266780255, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05831028468524798, "epoch": 342, "n_parameters": 329209088} +{"train_lr": 0.00015234681825189645, "train_min_lr": 0.00015234681825189645, "train_loss": 0.6178475774358958, "train_loss_scale": 3118841.435897436, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.059250366169577226, "epoch": 343, "n_parameters": 329209088} +{"train_lr": 0.0001474509104054623, "train_min_lr": 0.0001474509104054623, "train_loss": 0.6176945664561712, "train_loss_scale": 4194304.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.0593567103601228, "epoch": 344, "n_parameters": 329209088} +{"train_lr": 0.00014263553901666844, "train_min_lr": 0.00014263553901666844, "train_loss": 0.6176360222116972, "train_loss_scale": 2298801.230769231, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 345, "n_parameters": 329209088} +{"train_lr": 0.00013790107079468978, "train_min_lr": 0.00013790107079468978, "train_loss": 0.6175331700987254, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.059549868453103, "epoch": 346, "n_parameters": 329209088} +{"train_lr": 0.00013324786628761168, "train_min_lr": 0.00013324786628761168, "train_loss": 0.6173387026915756, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.0594609524959173, "epoch": 347, "n_parameters": 329209088} +{"train_lr": 0.00012867627985497265, "train_min_lr": 0.00012867627985497265, "train_loss": 0.6172239957198214, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.06062490045307921, "epoch": 348, "n_parameters": 329209088} +{"train_lr": 0.00012418665964077967, "train_min_lr": 0.00012418665964077967, 
"train_loss": 0.6170887761892607, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.06008780175724473, "epoch": 349, "n_parameters": 329209088} +{"train_lr": 0.00011977934754699389, "train_min_lr": 0.00011977934754699389, "train_loss": 0.616958317370751, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05987997680233839, "epoch": 350, "n_parameters": 329209088} +{"train_lr": 0.00011545467920749488, "train_min_lr": 0.00011545467920749488, "train_loss": 0.6167866381076283, "train_loss_scale": 2614718.358974359, "train_weight_decay": 0.05000000000000026, "train_grad_norm": NaN, "epoch": 351, "n_parameters": 329209088} +{"train_lr": 0.00011121298396252067, "train_min_lr": 0.00011121298396252067, "train_loss": 0.6167615295913166, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.060026866574891105, "epoch": 352, "n_parameters": 329209088} +{"train_lr": 0.00010705458483358618, "train_min_lr": 0.00010705458483358618, "train_loss": 0.6167897622948751, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.06035228763730862, "epoch": 353, "n_parameters": 329209088} +{"train_lr": 0.00010297979849888524, "train_min_lr": 0.00010297979849888524, "train_loss": 0.616521514313391, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.059711449719869934, "epoch": 354, "n_parameters": 329209088} +{"train_lr": 9.898893526917322e-05, "train_min_lr": 9.898893526917322e-05, "train_loss": 0.6164810129930862, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05972302886538017, "epoch": 355, "n_parameters": 329209088} +{"train_lr": 9.508229906413639e-05, "train_min_lr": 9.508229906413639e-05, "train_loss": 0.6164099339586802, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 
0.06167773603915404, "epoch": 356, "n_parameters": 329209088} +{"train_lr": 9.126018738924707e-05, "train_min_lr": 9.126018738924707e-05, "train_loss": 0.616182995452665, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.06037763428563873, "epoch": 357, "n_parameters": 329209088} +{"train_lr": 8.752289131310685e-05, "train_min_lr": 8.752289131310685e-05, "train_loss": 0.616092385783887, "train_loss_scale": 3851500.3076923075, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.06158848246559501, "epoch": 358, "n_parameters": 329209088} +{"train_lr": 8.387069544528184e-05, "train_min_lr": 8.387069544528184e-05, "train_loss": 0.6160591879608825, "train_loss_scale": 2668491.487179487, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 359, "n_parameters": 329209088} +{"train_lr": 8.030387791462727e-05, "train_min_lr": 8.030387791462727e-05, "train_loss": 0.6159726758308423, "train_loss_scale": 2029935.5897435897, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 360, "n_parameters": 329209088} +{"train_lr": 7.682271034810752e-05, "train_min_lr": 7.682271034810752e-05, "train_loss": 0.6157802377517024, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.061162601523578934, "epoch": 361, "n_parameters": 329209088} +{"train_lr": 7.342745785011076e-05, "train_min_lr": 7.342745785011076e-05, "train_loss": 0.6157825800817077, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.06114464859740856, "epoch": 362, "n_parameters": 329209088} +{"train_lr": 7.01183789822599e-05, "train_min_lr": 7.01183789822599e-05, "train_loss": 0.6156796769967351, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.061237632093998864, "epoch": 363, "n_parameters": 329209088} +{"train_lr": 6.689572574372245e-05, "train_min_lr": 6.689572574372245e-05, 
"train_loss": 0.615686372680685, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.06131900971134504, "epoch": 364, "n_parameters": 329209088} +{"train_lr": 6.375974355201949e-05, "train_min_lr": 6.375974355201949e-05, "train_loss": 0.6154456634420711, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.06144990777978913, "epoch": 365, "n_parameters": 329209088} +{"train_lr": 6.07106712243363e-05, "train_min_lr": 6.07106712243363e-05, "train_loss": 0.6154503608540369, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.06206150356536874, "epoch": 366, "n_parameters": 329209088} +{"train_lr": 5.774874095933571e-05, "train_min_lr": 5.774874095933571e-05, "train_loss": 0.6154025624541996, "train_loss_scale": 1734183.3846153845, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.060618695408965535, "epoch": 367, "n_parameters": 329209088} +{"train_lr": 5.487417831947492e-05, "train_min_lr": 5.487417831947492e-05, "train_loss": 0.6153451637364924, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.06061985910846255, "epoch": 368, "n_parameters": 329209088} +{"train_lr": 5.208720221382823e-05, "train_min_lr": 5.208720221382823e-05, "train_loss": 0.6151150952941046, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.060271603688120075, "epoch": 369, "n_parameters": 329209088} +{"train_lr": 4.938802488141633e-05, "train_min_lr": 4.938802488141633e-05, "train_loss": 0.6151448833959129, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.06074908639614781, "epoch": 370, "n_parameters": 329209088} +{"train_lr": 4.677685187504342e-05, "train_min_lr": 4.677685187504342e-05, "train_loss": 0.6151111972554085, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 
0.06114926136648043, "epoch": 371, "n_parameters": 329209088} +{"train_lr": 4.425388204564351e-05, "train_min_lr": 4.425388204564351e-05, "train_loss": 0.6150053266483622, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.06076718278181476, "epoch": 372, "n_parameters": 329209088} +{"train_lr": 4.181930752713709e-05, "train_min_lr": 4.181930752713709e-05, "train_loss": 0.6149013768702459, "train_loss_scale": 2117316.923076923, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 373, "n_parameters": 329209088} +{"train_lr": 3.947331372179967e-05, "train_min_lr": 3.947331372179967e-05, "train_loss": 0.614739190775137, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.060880790082499005, "epoch": 374, "n_parameters": 329209088} +{"train_lr": 3.7216079286142414e-05, "train_min_lr": 3.7216079286142414e-05, "train_loss": 0.6148230330779766, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.060367274730920024, "epoch": 375, "n_parameters": 329209088} +{"train_lr": 3.504777611730673e-05, "train_min_lr": 3.504777611730673e-05, "train_loss": 0.6148029503603585, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.06092081310896155, "epoch": 376, "n_parameters": 329209088} +{"train_lr": 3.296856933997393e-05, "train_min_lr": 3.296856933997393e-05, "train_loss": 0.6146716189105064, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.06024718504303541, "epoch": 377, "n_parameters": 329209088} +{"train_lr": 3.097861729379017e-05, "train_min_lr": 3.097861729379017e-05, "train_loss": 0.6146338271896522, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05957711665914991, "epoch": 378, "n_parameters": 329209088} +{"train_lr": 2.9078071521308036e-05, "train_min_lr": 2.9078071521308036e-05, 
"train_loss": 0.6145035460627136, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.059493583209143996, "epoch": 379, "n_parameters": 329209088} +{"train_lr": 2.726707675644639e-05, "train_min_lr": 2.726707675644639e-05, "train_loss": 0.6144424588419497, "train_loss_scale": 3105398.153846154, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 380, "n_parameters": 329209088} +{"train_lr": 2.554577091346818e-05, "train_min_lr": 2.554577091346818e-05, "train_loss": 0.6144928194057101, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.059755704747751735, "epoch": 381, "n_parameters": 329209088} +{"train_lr": 2.39142850764776e-05, "train_min_lr": 2.39142850764776e-05, "train_loss": 0.6143560699629, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05846842838069185, "epoch": 382, "n_parameters": 329209088} +{"train_lr": 2.2372743489437732e-05, "train_min_lr": 2.2372743489437732e-05, "train_loss": 0.6143771248439757, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05851758097131283, "epoch": 383, "n_parameters": 329209088} +{"train_lr": 2.092126354670879e-05, "train_min_lr": 2.092126354670879e-05, "train_loss": 0.6143331871165011, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05992752422268192, "epoch": 384, "n_parameters": 329209088} +{"train_lr": 1.9559955784107943e-05, "train_min_lr": 1.9559955784107943e-05, "train_loss": 0.6143057309867194, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.058685124350281864, "epoch": 385, "n_parameters": 329209088} +{"train_lr": 1.8288923870491904e-05, "train_min_lr": 1.8288923870491904e-05, "train_loss": 0.6142979655116318, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 
0.05840848087786864, "epoch": 386, "n_parameters": 329209088} +{"train_lr": 1.7108264599861833e-05, "train_min_lr": 1.7108264599861833e-05, "train_loss": 0.6142287193439327, "train_loss_scale": 2026574.7692307692, "train_weight_decay": 0.05000000000000026, "train_grad_norm": Infinity, "epoch": 387, "n_parameters": 329209088} +{"train_lr": 1.6018067883992388e-05, "train_min_lr": 1.6018067883992388e-05, "train_loss": 0.6141766291529609, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.057539312300296165, "epoch": 388, "n_parameters": 329209088} +{"train_lr": 1.5018416745584281e-05, "train_min_lr": 1.5018416745584281e-05, "train_loss": 0.6141323318812423, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05734270910183207, "epoch": 389, "n_parameters": 329209088} +{"train_lr": 1.410938731194203e-05, "train_min_lr": 1.410938731194203e-05, "train_loss": 0.6140697077453996, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.057281388996694334, "epoch": 390, "n_parameters": 329209088} +{"train_lr": 1.3291048809176455e-05, "train_min_lr": 1.3291048809176455e-05, "train_loss": 0.6141464590118864, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05875818443317444, "epoch": 391, "n_parameters": 329209088} +{"train_lr": 1.2563463556932867e-05, "train_min_lr": 1.2563463556932867e-05, "train_loss": 0.614029985315238, "train_loss_scale": 1048576.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05735492182131379, "epoch": 392, "n_parameters": 329209088} +{"train_lr": 1.1926686963645178e-05, "train_min_lr": 1.1926686963645178e-05, "train_loss": 0.6140755547676235, "train_loss_scale": 1152761.435897436, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05674764691875913, "epoch": 393, "n_parameters": 329209088} +{"train_lr": 1.138076752231636e-05, "train_min_lr": 
1.138076752231636e-05, "train_loss": 0.6140404714343066, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05714714400565777, "epoch": 394, "n_parameters": 329209088} +{"train_lr": 1.0925746806825481e-05, "train_min_lr": 1.0925746806825481e-05, "train_loss": 0.6140535513106256, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05643063993790211, "epoch": 395, "n_parameters": 329209088} +{"train_lr": 1.0561659468761706e-05, "train_min_lr": 1.0561659468761706e-05, "train_loss": 0.6139595152822156, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.0575700929054083, "epoch": 396, "n_parameters": 329209088} +{"train_lr": 1.0288533234785454e-05, "train_min_lr": 1.0288533234785454e-05, "train_loss": 0.6139694767502638, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05775066392305188, "epoch": 397, "n_parameters": 329209088} +{"train_lr": 1.0106388904516885e-05, "train_min_lr": 1.0106388904516885e-05, "train_loss": 0.6139701316610743, "train_loss_scale": 2097152.0, "train_weight_decay": 0.05000000000000026, "train_grad_norm": 0.05698600644245744, "epoch": 398, "n_parameters": 329209088} \ No newline at end of file diff --git a/cv/Self-Supervised Learning/MAE/pytorch/files/pretrain_large_0.75_400e_finetune_50e.txt b/cv/Self-Supervised Learning/MAE/pytorch/files/pretrain_large_0.75_400e_finetune_50e.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd7fc64621b55350dfed8e25e6d50ee0b79e2d5f --- /dev/null +++ b/cv/Self-Supervised Learning/MAE/pytorch/files/pretrain_large_0.75_400e_finetune_50e.txt @@ -0,0 +1,49 @@ +{"train_lr": 0.0007989756722151088, "train_min_lr": 6.012639153584639e-07, "train_loss": 5.127884587347507, "train_loss_scale": 26869.76, "train_weight_decay": 0.0500000000000005, "train_grad_norm": NaN, "test_loss": 1.4843358747551547, "test_acc1": 
64.26543707833866, "test_acc5": 88.16978823429335, "epoch": 0, "n_parameters": 304099304} +{"train_lr": 0.0023994878361075543, "train_min_lr": 1.8057188740092326e-06, "train_loss": 4.216064654672146, "train_loss_scale": 8192.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.533545329284668, "test_loss": 1.2492685344837167, "test_acc1": 69.97760927974606, "test_acc5": 90.96089497957944, "epoch": 1, "n_parameters": 304099304} +{"train_lr": 0.004, "train_min_lr": 3.0101738326600017e-06, "train_loss": 3.981323297941685, "train_loss_scale": 8192.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.4938371562957764, "test_loss": 1.2285999098025702, "test_acc1": 71.82901702229212, "test_acc5": 91.87660214402167, "epoch": 2, "n_parameters": 304099304} +{"train_lr": 0.005600512163892445, "train_min_lr": 4.214628791310769e-06, "train_loss": 3.8446675183713435, "train_loss_scale": 8309.9648, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.476758701133728, "test_loss": 1.1472980488529976, "test_acc1": 73.19657960459733, "test_acc5": 92.5523856386342, "epoch": 3, "n_parameters": 304099304} +{"train_lr": 0.007201024327784893, "train_min_lr": 5.419083749961539e-06, "train_loss": 3.761024636930227, "train_loss_scale": 16384.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.4625869470596313, "test_loss": 1.1038400521501899, "test_acc1": 73.7763938171621, "test_acc5": 92.95025837810392, "epoch": 4, "n_parameters": 304099304} +{"train_lr": 0.007996759735220758, "train_min_lr": 6.0179092252576584e-06, "train_loss": 3.676156573832035, "train_loss_scale": 16384.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.421549574661255, "test_loss": 1.0929591273341108, "test_acc1": 74.86404592473768, "test_acc5": 93.32613809049244, "epoch": 5, "n_parameters": 304099304} +{"train_lr": 0.00797730594762296, "train_min_lr": 6.003269404664408e-06, "train_loss": 3.5953619186282157, "train_loss_scale": 16384.0, 
"train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.4093980346679686, "test_loss": 1.047218256520436, "test_acc1": 75.92370691729599, "test_acc5": 93.67402664347482, "epoch": 6, "n_parameters": 304099304} +{"train_lr": 0.007938477561195478, "train_min_lr": 5.974049356467313e-06, "train_loss": 3.5294715549707414, "train_loss_scale": 29727.1296, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.4034552177429198, "test_loss": 1.025527286781546, "test_acc1": 76.45353753095397, "test_acc5": 94.06790079676945, "epoch": 7, "n_parameters": 304099304} +{"train_lr": 0.007880463743933985, "train_min_lr": 5.930391437803984e-06, "train_loss": 3.4760927345097064, "train_loss_scale": 32768.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.4005069131851196, "test_loss": 0.9883945362971109, "test_acc1": 77.02535457665998, "test_acc5": 94.25983919749562, "epoch": 8, "n_parameters": 304099304} +{"train_lr": 0.007803547133324623, "train_min_lr": 5.872508345665689e-06, "train_loss": 3.4368253495156766, "train_loss_scale": 32768.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.4067427585601806, "test_loss": 0.9456459115094998, "test_acc1": 77.45121807740884, "test_acc5": 94.43378369043977, "epoch": 9, "n_parameters": 304099304} +{"train_lr": 0.007708102459362675, "train_min_lr": 5.800682080658928e-06, "train_loss": 3.3876734405994413, "train_loss_scale": 47762.6368, "train_weight_decay": 0.0500000000000005, "train_grad_norm": NaN, "test_loss": 0.9333075336454546, "test_acc1": 77.54518795104951, "test_acc5": 94.48976568571665, "epoch": 10, "n_parameters": 304099304} +{"train_lr": 0.0075945947189056, "train_min_lr": 5.7152625731268676e-06, "train_loss": 3.350349271655083, "train_loss_scale": 32768.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.4042285106658936, "test_loss": 0.914106177385239, "test_acc1": 78.29294876494014, "test_acc5": 94.80766415962103, "epoch": 11, "n_parameters": 304099304} +{"train_lr": 
0.007463576910255154, "train_min_lr": 5.616665978323869e-06, "train_loss": 3.3156174855113028, "train_loss_scale": 32768.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.4055181201934814, "test_loss": 0.9064572855611058, "test_acc1": 78.63883805000393, "test_acc5": 95.03159227252236, "epoch": 12, "n_parameters": 304099304} +{"train_lr": 0.007315687339005228, "train_min_lr": 5.505372648948902e-06, "train_loss": 3.2823165160298347, "train_loss_scale": 32768.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.4026085538864135, "test_loss": 0.9011619745151085, "test_acc1": 78.9247464817911, "test_acc5": 95.12956087877562, "epoch": 13, "n_parameters": 304099304} +{"train_lr": 0.007151646508281118, "train_min_lr": 5.38192479491553e-06, "train_loss": 3.243818454784155, "train_loss_scale": 62862.1312, "train_weight_decay": 0.0500000000000005, "train_grad_norm": NaN, "test_loss": 0.8837069476242451, "test_acc1": 79.21265436591663, "test_acc5": 95.32349896934348, "epoch": 14, "n_parameters": 304099304} +{"train_lr": 0.006972253608520829, "train_min_lr": 5.246923841759661e-06, "train_loss": 3.2151959358870985, "train_loss_scale": 32768.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.4083928579330445, "test_loss": 0.8597317022983643, "test_acc1": 79.61452616748333, "test_acc5": 95.33149647163567, "epoch": 15, "n_parameters": 304099304} +{"train_lr": 0.006778382623899437, "train_min_lr": 5.1010275005548245e-06, "train_loss": 3.1898831700742245, "train_loss_scale": 32768.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.4162667011260985, "test_loss": 0.8535782197928604, "test_acc1": 79.8904376048273, "test_acc5": 95.42546634939491, "epoch": 16, "n_parameters": 304099304} +{"train_lr": 0.006570978074366196, "train_min_lr": 4.944946563609925e-06, "train_loss": 3.162099386626482, "train_loss_scale": 27918.336, "train_weight_decay": 0.0500000000000005, "train_grad_norm": Infinity, "test_loss": 0.8320672392352101, 
"test_acc1": 80.27631411213792, "test_acc5": 95.5914131579152, "epoch": 17, "n_parameters": 304099304} +{"train_lr": 0.006351050414038195, "train_min_lr": 4.7794414415605505e-06, "train_loss": 3.1394116669654846, "train_loss_scale": 16384.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.4016926095962525, "test_loss": 0.8336338165270931, "test_acc1": 80.36628525225078, "test_acc5": 95.73336784121172, "epoch": 18, "n_parameters": 304099304} +{"train_lr": 0.006119671108369501, "train_min_lr": 4.605318458724817e-06, "train_loss": 3.109048558318615, "train_loss_scale": 16384.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.4072301555633544, "test_loss": 0.8483505359686473, "test_acc1": 80.63819816199465, "test_acc5": 95.70537687400481, "epoch": 19, "n_parameters": 304099304} +{"train_lr": 0.00587796741407915, "train_min_lr": 4.423425924772311e-06, "train_loss": 3.0840408909976484, "train_loss_scale": 17956.864, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.4081366298675537, "test_loss": 0.8239507561768679, "test_acc1": 80.88212012283633, "test_acc5": 95.83933392641869, "epoch": 20, "n_parameters": 304099304} +{"train_lr": 0.0056271168872697895, "train_min_lr": 4.234650001844681e-06, "train_loss": 3.0596278811216355, "train_loss_scale": 32768.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.400708843612671, "test_loss": 0.8104294325937244, "test_acc1": 80.91011133578368, "test_acc5": 95.88531908421507, "epoch": 21, "n_parameters": 304099304} +{"train_lr": 0.005368341646492603, "train_min_lr": 4.0399103872627316e-06, "train_loss": 3.033347837257385, "train_loss_scale": 32768.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.4039134342193604, "test_loss": 0.802442946799976, "test_acc1": 81.22401075674338, "test_acc5": 96.16922839452117, "epoch": 22, "n_parameters": 304099304} +{"train_lr": 0.005102902418708659, "train_min_lr": 3.8401558328535635e-06, "train_loss": 3.011972742456198, 
"train_loss_scale": 32768.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.3984670780181885, "test_loss": 0.7846099634052199, "test_acc1": 81.6518738484886, "test_acc5": 96.10724816578592, "epoch": 23, "n_parameters": 304099304} +{"train_lr": 0.004832092397153882, "train_min_lr": 3.6363595227269922e-06, "train_loss": 2.982935475295782, "train_loss_scale": 37434.1632, "train_weight_decay": 0.0500000000000005, "train_grad_norm": Infinity, "test_loss": 0.7864276682958007, "test_acc1": 81.68386378855715, "test_acc5": 96.19921868227264, "epoch": 24, "n_parameters": 304099304} +{"train_lr": 0.0045572309410315485, "train_min_lr": 3.4295143320204163e-06, "train_loss": 2.9611134263038634, "train_loss_scale": 32768.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.3973436738967895, "test_loss": 0.7575029051479172, "test_acc1": 82.05374511738886, "test_acc5": 96.26119886387333, "epoch": 25, "n_parameters": 304099304} +{"train_lr": 0.004279657147726887, "train_min_lr": 3.2206279897109587e-06, "train_loss": 2.9340118052363398, "train_loss_scale": 32768.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.395601418876648, "test_loss": 0.7658200605279383, "test_acc1": 82.17570618979074, "test_acc5": 96.33317601474828, "epoch": 26, "n_parameters": 304099304} +{"train_lr": 0.0040007233288593, "train_min_lr": 3.0107181690611663e-06, "train_loss": 2.911110460066795, "train_loss_scale": 16777.216, "train_weight_decay": 0.0500000000000005, "train_grad_norm": Infinity, "test_loss": 0.7580551891861593, "test_acc1": 82.38763844486391, "test_acc5": 96.37716181905165, "epoch": 27, "n_parameters": 304099304} +{"train_lr": 0.003721788421955952, "train_min_lr": 2.800807529617191e-06, "train_loss": 2.8879415444254875, "train_loss_scale": 16384.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.3968701183319092, "test_loss": 0.7464419478440986, "test_acc1": 82.56158278130295, "test_acc5": 96.4311443963115, "epoch": 28, "n_parameters": 
304099304} +{"train_lr": 0.0034442113698448444, "train_min_lr": 2.591918734914248e-06, "train_loss": 2.8644816453993323, "train_loss_scale": 16384.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.391160698890686, "test_loss": 0.7561441035941243, "test_acc1": 82.7535212900268, "test_acc5": 96.54110932212873, "epoch": 29, "n_parameters": 304099304} +{"train_lr": 0.0031693445000219155, "train_min_lr": 2.385069470162715e-06, "train_loss": 2.840838378119469, "train_loss_scale": 29097.984, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.3920515712738037, "test_loss": 0.7252459700548035, "test_acc1": 83.02943295938269, "test_acc5": 96.56710093904594, "epoch": 30, "n_parameters": 304099304} +{"train_lr": 0.0028985269362474214, "train_min_lr": 2.1812674841880337e-06, "train_loss": 2.8155260333061216, "train_loss_scale": 32768.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.39368137550354, "test_loss": 0.7195639022461632, "test_acc1": 83.09741142844055, "test_acc5": 96.72505035739027, "epoch": 31, "n_parameters": 304099304} +{"train_lr": 0.002633078074469583, "train_min_lr": 1.98150567977978e-06, "train_loss": 2.794850654733181, "train_loss_scale": 32768.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.3816441736221314, "test_loss": 0.7230140563538846, "test_acc1": 83.33933398865463, "test_acc5": 96.76103879531377, "epoch": 32, "n_parameters": 304099304} +{"train_lr": 0.0023742911548601587, "train_min_lr": 1.7867572763690353e-06, "train_loss": 2.7675574938356875, "train_loss_scale": 20185.088, "train_weight_decay": 0.0500000000000005, "train_grad_norm": NaN, "test_loss": 0.7059620880686185, "test_acc1": 83.60724819553104, "test_acc5": 96.85101016194716, "epoch": 33, "n_parameters": 304099304} +{"train_lr": 0.0021234269612783146, "train_min_lr": 1.5979710686011809e-06, "train_loss": 2.748373505538702, "train_loss_scale": 16384.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.3870051706314086, 
"test_loss": 0.7138993278255358, "test_acc1": 83.50927951697425, "test_acc5": 96.83901407531036, "epoch": 34, "n_parameters": 304099304} +{"train_lr": 0.0018817076788584007, "train_min_lr": 1.4160668039037374e-06, "train_loss": 2.7253072077810763, "train_loss_scale": 16384.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.378683233642578, "test_loss": 0.7143344982691547, "test_acc1": 83.5152777156537, "test_acc5": 96.86500568765139, "epoch": 35, "n_parameters": 304099304} +{"train_lr": 0.0016503109396468272, "train_min_lr": 1.2419307015693537e-06, "train_loss": 2.7030464783608914, "train_loss_scale": 25690.112, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.3855178482055663, "test_loss": 0.7060077934361556, "test_acc1": 83.7591995181598, "test_acc5": 97.0069602116971, "epoch": 36, "n_parameters": 304099304} +{"train_lr": 0.001430364085297117, "train_min_lr": 1.0764111351845088e-06, "train_loss": 2.6795597024202347, "train_loss_scale": 22623.0272, "train_weight_decay": 0.0500000000000005, "train_grad_norm": Infinity, "test_loss": 0.7025160825099138, "test_acc1": 83.89515606821612, "test_acc5": 97.01695694804421, "epoch": 37, "n_parameters": 304099304} +{"train_lr": 0.001222938674774683, "train_min_lr": 9.203144994386624e-07, "train_loss": 2.6667806334018707, "train_loss_scale": 16384.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.3829653573989868, "test_loss": 0.7005323168602499, "test_acc1": 83.971131689489, "test_acc5": 97.02095568797867, "epoch": 38, "n_parameters": 304099304} +{"train_lr": 0.0010290452638292572, "train_min_lr": 7.744012814503836e-07, "train_loss": 2.6435641426086427, "train_loss_scale": 16384.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.37545643119812, "test_loss": 0.6936904947163866, "test_acc1": 84.00911955000572, "test_acc5": 97.00895954002117, "epoch": 39, "n_parameters": 304099304} +{"train_lr": 0.0008496284816688692, "train_min_lr": 6.393823557505685e-07, "train_loss": 
2.623698795390129, "train_loss_scale": 17694.72, "train_weight_decay": 0.0500000000000005, "train_grad_norm": Infinity, "test_loss": 0.6931187949307701, "test_acc1": 84.12108376296148, "test_acc5": 97.0209558691951, "epoch": 40, "n_parameters": 304099304} +{"train_lr": 0.0006855624288212925, "train_min_lr": 5.159155209731728e-07, "train_loss": 2.6149778491318227, "train_loss_scale": 16384.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.376740151977539, "test_loss": 0.6849605971947312, "test_acc1": 84.22904921950855, "test_acc5": 97.01895641685677, "epoch": 41, "n_parameters": 304099304} +{"train_lr": 0.0005376464186041391, "train_min_lr": 4.046022951263863e-07, "train_loss": 2.6032811626434325, "train_loss_scale": 16384.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.3765522369384766, "test_loss": 0.6848397854003397, "test_acc1": 84.36500566477052, "test_acc5": 97.08093666069338, "epoch": 42, "n_parameters": 304099304} +{"train_lr": 0.00040660108295072266, "train_min_lr": 3.059849850573709e-07, "train_loss": 2.5880228871881963, "train_loss_scale": 18664.6528, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.3685073076248169, "test_loss": 0.6841567384681719, "test_acc1": 84.39699543002928, "test_acc5": 97.08693470515583, "epoch": 43, "n_parameters": 304099304} +{"train_lr": 0.0002930648615637144, "train_min_lr": 2.2054404438780418e-07, "train_loss": 2.5704955228745936, "train_loss_scale": 32768.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.3738655996322633, "test_loss": 0.6838053980403963, "test_acc1": 84.40699230000062, "test_acc5": 97.14091745081882, "epoch": 44, "n_parameters": 304099304} +{"train_lr": 0.00019759089150108976, "train_min_lr": 1.486957327921355e-07, "train_loss": 2.567389948529005, "train_loss_scale": 21076.3776, "train_weight_decay": 0.0500000000000005, "train_grad_norm": NaN, "test_loss": 0.6832064684370861, "test_acc1": 84.44298071595848, "test_acc5": 97.1169251150911, 
"epoch": 45, "n_parameters": 304099304} +{"train_lr": 0.00012064431234798539, "train_min_lr": 9.079008802229141e-08, "train_loss": 2.563270782697201, "train_loss_scale": 11691.6224, "train_weight_decay": 0.0500000000000005, "train_grad_norm": NaN, "test_loss": 0.6835205754648674, "test_acc1": 84.42898519528805, "test_acc5": 97.11492574512387, "epoch": 46, "n_parameters": 304099304} +{"train_lr": 6.260000010339199e-05, "train_min_lr": 4.7109220558935905e-08, "train_loss": 2.559054845803976, "train_loss_scale": 8192.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.3721091989517211, "test_loss": 0.6819794948541504, "test_acc1": 84.47097172480856, "test_acc5": 97.15091418487783, "epoch": 47, "n_parameters": 304099304} +{"train_lr": 2.3740740821970863e-05, "train_min_lr": 1.786593919756495e-08, "train_loss": 2.5566883543133736, "train_loss_scale": 8192.0, "train_weight_decay": 0.0500000000000005, "train_grad_norm": 1.3672521203994752, "test_loss": 0.681185031802777, "test_acc1": 84.46097495368255, "test_acc5": 97.15491296325214, "epoch": 48, "n_parameters": 304099304} \ No newline at end of file diff --git a/cv/Self-Supervised Learning/MAE/pytorch/masking_generator.py b/cv/Self-Supervised Learning/MAE/pytorch/masking_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..86e4466419c6ebbe2b88a16845b349afeb04c4c8 --- /dev/null +++ b/cv/Self-Supervised Learning/MAE/pytorch/masking_generator.py @@ -0,0 +1,34 @@ +# -------------------------------------------------------- +# Based on BEiT, timm, DINO and DeiT code bases +# https://github.com/microsoft/unilm/tree/master/beit +# https://github.com/rwightman/pytorch-image-models/tree/master/timm +# https://github.com/facebookresearch/deit +# https://github.com/facebookresearch/dino +# --------------------------------------------------------' +import random +import math +import numpy as np + +class RandomMaskingGenerator: + def __init__(self, input_size, mask_ratio): + if not 
isinstance(input_size, tuple): + input_size = (input_size,) * 2 + + self.height, self.width = input_size + + self.num_patches = self.height * self.width + self.num_mask = int(mask_ratio * self.num_patches) + + def __repr__(self): + repr_str = "Maks: total patches {}, mask patches {}".format( + self.num_patches, self.num_mask + ) + return repr_str + + def __call__(self): + mask = np.hstack([ + np.zeros(self.num_patches - self.num_mask), + np.ones(self.num_mask), + ]) + np.random.shuffle(mask) + return mask # [196] \ No newline at end of file diff --git a/cv/Self-Supervised Learning/MAE/pytorch/modeling_finetune.py b/cv/Self-Supervised Learning/MAE/pytorch/modeling_finetune.py new file mode 100644 index 0000000000000000000000000000000000000000..be0b7e0403ce702a3d049ffd051e3fb6a8a72651 --- /dev/null +++ b/cv/Self-Supervised Learning/MAE/pytorch/modeling_finetune.py @@ -0,0 +1,339 @@ +# -------------------------------------------------------- +# Based on BEiT, timm, DINO and DeiT code bases +# https://github.com/microsoft/unilm/tree/master/beit +# https://github.com/rwightman/pytorch-image-models/tree/master/timm +# https://github.com/facebookresearch/deit +# https://github.com/facebookresearch/dino +# --------------------------------------------------------' +import math +from functools import partial +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F +from timm.models.layers import drop_path, to_2tuple, trunc_normal_ +from timm.models.registry import register_model + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': .9, 'interpolation': 'bicubic', + 'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5), + **kwargs + } + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
+ """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return 'p={}'.format(self.drop_prob) + + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + # x = self.drop(x) + # commit this for the orignal BERT implement + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + def __init__( + self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., + proj_drop=0., attn_head_dim=None): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + if attn_head_dim is not None: + head_dim = attn_head_dim + all_head_dim = head_dim * self.num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False) + if qkv_bias: + self.q_bias = nn.Parameter(torch.zeros(all_head_dim)) + self.v_bias = nn.Parameter(torch.zeros(all_head_dim)) + else: + self.q_bias = None + self.v_bias = None + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(all_head_dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + qkv_bias = None + if self.q_bias is not None: + qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias)) + # qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) + qkv = qkv.reshape(B, N, 3, 
class Block(nn.Module):
    """Pre-norm transformer block: self-attention + MLP, each with a residual
    connection, optional stochastic depth (DropPath) and optional LayerScale.

    Args:
        dim: embedding dimension.
        num_heads: number of attention heads.
        mlp_ratio: MLP hidden width relative to ``dim``.
        qkv_bias, qk_scale, drop, attn_drop, attn_head_dim: forwarded to
            ``Attention`` / ``Mlp``.
        drop_path: stochastic-depth rate; 0 keeps an ``nn.Identity``.
        init_values: LayerScale initial value for ``gamma_1``/``gamma_2``;
            ``None`` or 0 disables LayerScale.
    """

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer=nn.LayerNorm,
                 attn_head_dim=None):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
            attn_drop=attn_drop, proj_drop=drop, attn_head_dim=attn_head_dim)
        # NOTE: drop path for stochastic depth; identity when the rate is 0.
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

        # BUG FIX: the original `if init_values > 0:` raised TypeError when the
        # default init_values=None was used ('>' unsupported between NoneType
        # and int in Python 3); guard against None before comparing.
        if init_values is not None and init_values > 0:
            self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True)
            self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True)
        else:
            self.gamma_1, self.gamma_2 = None, None

    def forward(self, x):
        if self.gamma_1 is None:
            # Plain pre-norm residual branches.
            x = x + self.drop_path(self.attn(self.norm1(x)))
            x = x + self.drop_path(self.mlp(self.norm2(x)))
        else:
            # LayerScale: scale each branch by a learnable per-channel gain.
            x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x)))
            x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
        return x
# sin-cos position encoding
# https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/master/transformer/Models.py#L31
def get_sinusoid_encoding_table(n_position, d_hid):
    """Build the fixed sine/cosine positional-encoding table.

    Returns a float tensor of shape (1, n_position, d_hid): even feature
    indices hold sin(pos / 10000^(2i/d_hid)), odd indices the matching
    cosine, as in "Attention Is All You Need".
    """
    # angle[p, j] = p / 10000^(2*(j//2)/d_hid), via broadcasting instead of
    # the per-position Python list comprehension.
    positions = np.arange(n_position, dtype=float).reshape(-1, 1)
    divisors = np.power(10000, 2 * (np.arange(d_hid) // 2) / d_hid)
    angles = positions / divisors

    table = np.empty_like(angles)
    table[:, 0::2] = np.sin(angles[:, 0::2])  # dim 2i -> sine
    table[:, 1::2] = np.cos(angles[:, 1::2])  # dim 2i+1 -> cosine

    return torch.FloatTensor(table).unsqueeze(0)
    def _init_weights(self, m):
        # Standard ViT initialization applied via self.apply(): truncated-normal
        # linear weights (std 0.02), zero biases, and identity-like LayerNorm
        # (weight 1, bias 0).
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def get_num_layers(self):
        # Transformer depth = number of residual blocks.
        return len(self.blocks)

    @torch.jit.ignore
    def no_weight_decay(self):
        # Parameter names to exclude from weight decay; 'cls_token' is listed
        # even though the token itself is commented out in this implementation.
        return {'pos_embed', 'cls_token'}

    def get_classifier(self):
        # The classification head (nn.Identity when num_classes == 0).
        return self.head

    def reset_classifier(self, num_classes, global_pool=''):
        # Replace the head with a fresh, randomly-initialized one for a new
        # label set. `global_pool` is accepted for timm API compatibility but
        # is unused here.
        self.num_classes = num_classes
        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
def forward_features(self, x): + x = self.patch_embed(x) + B, _, _ = x.size() + + # cls_tokens = self.cls_token.expand(batch_size, -1, -1) # stole cls_tokens impl from Phil Wang, thanks + # x = torch.cat((cls_tokens, x), dim=1) + if self.pos_embed is not None: + x = x + self.pos_embed.expand(B, -1, -1).type_as(x).to(x.device).clone().detach() + x = self.pos_drop(x) + + for blk in self.blocks: + x = blk(x) + + x = self.norm(x) + if self.fc_norm is not None: + # return self.fc_norm(x[:, 1:].mean(1)) + return self.fc_norm(x.mean(1)) + else: + return x[:, 0] + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + +@register_model +def vit_small_patch16_224(pretrained=False, **kwargs): + model = VisionTransformer( + patch_size=16, embed_dim=384, depth=12, num_heads=6, mlp_ratio=4, qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + model.default_cfg = _cfg() + return model + +@register_model +def vit_base_patch16_224(pretrained=False, **kwargs): + model = VisionTransformer( + patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + model.default_cfg = _cfg() + return model + + +@register_model +def vit_base_patch16_384(pretrained=False, **kwargs): + model = VisionTransformer( + img_size=384, patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + model.default_cfg = _cfg() + return model + + +@register_model +def vit_large_patch16_224(pretrained=False, **kwargs): + model = VisionTransformer( + patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + model.default_cfg = _cfg() + return model + + +@register_model +def vit_large_patch16_384(pretrained=False, **kwargs): + model = VisionTransformer( + img_size=384, patch_size=16, embed_dim=1024, depth=24, num_heads=16, 
class PretrainVisionTransformerEncoder(nn.Module):
    """MAE pretraining encoder.

    Embeds an image into patch tokens, adds positional embeddings, discards the
    masked patches, and runs the transformer blocks on the visible tokens only.

    Args mirror `VisionTransformer` in modeling_finetune; `num_classes=0` keeps
    the head as Identity so the encoder emits raw features.
    """
    def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=0, embed_dim=768, depth=12,
                 num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
                 drop_path_rate=0., norm_layer=nn.LayerNorm, init_values=None,
                 use_learnable_pos_emb=False):
        super().__init__()
        self.num_classes = num_classes
        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models

        self.patch_embed = PatchEmbed(
            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
        num_patches = self.patch_embed.num_patches

        # TODO: Add the cls token
        # self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        if use_learnable_pos_emb:
            # BUGFIX: was `num_patches + 1`, a leftover of the commented-out cls
            # token. forward_features adds pos_embed to exactly num_patches
            # tokens, so the extra row broke broadcasting / token alignment.
            self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
        else:
            # sine-cosine positional embeddings
            self.pos_embed = get_sinusoid_encoding_table(num_patches, embed_dim)

        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
        self.blocks = nn.ModuleList([
            Block(
                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
                init_values=init_values)
            for i in range(depth)])
        self.norm = norm_layer(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()

        if use_learnable_pos_emb:
            trunc_normal_(self.pos_embed, std=.02)

        # trunc_normal_(self.cls_token, std=.02)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        # Xavier-uniform for Linear weights, zeros/ones for LayerNorm.
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def get_num_layers(self):
        # Number of transformer blocks (used for layer-wise LR decay).
        return len(self.blocks)

    @torch.jit.ignore
    def no_weight_decay(self):
        # Parameter names excluded from weight decay by the optimizer factory.
        return {'pos_embed', 'cls_token'}

    def get_classifier(self):
        return self.head

    def reset_classifier(self, num_classes, global_pool=''):
        self.num_classes = num_classes
        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()

    def forward_features(self, x, mask):
        """Encode only the visible patches.

        Args:
            x: input images [B, C, H, W].
            mask: boolean [B, num_patches]; True marks a *masked* patch.
                  Assumes every sample has the same number of visible patches,
                  otherwise the reshape below would fail.
        Returns:
            Normalized features of the visible tokens, [B, N_vis, embed_dim].
        """
        x = self.patch_embed(x)

        # cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        # NOTE(review): .detach() also stops gradients when pos_embed is a
        # learnable nn.Parameter — confirm intended.
        x = x + self.pos_embed.type_as(x).to(x.device).clone().detach()

        B, _, C = x.shape
        x_vis = x[~mask].reshape(B, -1, C)  # ~mask means visible

        for blk in self.blocks:
            x_vis = blk(x_vis)

        x_vis = self.norm(x_vis)
        return x_vis

    def forward(self, x, mask):
        x = self.forward_features(x, mask)
        x = self.head(x)
        return x
class PretrainVisionTransformer(nn.Module):
    """Full MAE pretraining model.

    Pipeline: encoder over visible patches -> linear projection to the decoder
    width -> concatenate (visible tokens + pos emb) with (mask token + pos emb)
    -> decoder predicts the pixels of the masked patches.

    The `num_classes`/`in_chans` parameters exist only to absorb kwargs that
    timm's create_fn passes; they are unused.
    """
    def __init__(self,
                 img_size=224,
                 patch_size=16,
                 encoder_in_chans=3,
                 encoder_num_classes=0,
                 encoder_embed_dim=768,
                 encoder_depth=12,
                 encoder_num_heads=12,
                 decoder_num_classes=768,
                 decoder_embed_dim=512,
                 decoder_depth=8,
                 decoder_num_heads=8,
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.,
                 norm_layer=nn.LayerNorm,
                 init_values=0.,
                 use_learnable_pos_emb=False,
                 num_classes=0,  # avoid the error from create_fn in timm
                 in_chans=0,  # avoid the error from create_fn in timm
                 ):
        super().__init__()
        self.encoder = PretrainVisionTransformerEncoder(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=encoder_in_chans,
            num_classes=encoder_num_classes,
            embed_dim=encoder_embed_dim,
            depth=encoder_depth,
            num_heads=encoder_num_heads,
            mlp_ratio=mlp_ratio,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop_rate=drop_rate,
            attn_drop_rate=attn_drop_rate,
            drop_path_rate=drop_path_rate,
            norm_layer=norm_layer,
            init_values=init_values,
            use_learnable_pos_emb=use_learnable_pos_emb)

        self.decoder = PretrainVisionTransformerDecoder(
            patch_size=patch_size,
            num_patches=self.encoder.patch_embed.num_patches,
            num_classes=decoder_num_classes,
            embed_dim=decoder_embed_dim,
            depth=decoder_depth,
            num_heads=decoder_num_heads,
            mlp_ratio=mlp_ratio,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop_rate=drop_rate,
            attn_drop_rate=attn_drop_rate,
            drop_path_rate=drop_path_rate,
            norm_layer=norm_layer,
            init_values=init_values)

        # Project encoder features into the (narrower) decoder width.
        self.encoder_to_decoder = nn.Linear(encoder_embed_dim, decoder_embed_dim, bias=False)

        # Shared learnable token standing in for every masked patch.
        self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_embed_dim))

        # Fixed sin-cos positional table at decoder width (not trained).
        self.pos_embed = get_sinusoid_encoding_table(self.encoder.patch_embed.num_patches, decoder_embed_dim)

        trunc_normal_(self.mask_token, std=.02)

    def _init_weights(self, m):
        # NOTE(review): not wired to self.apply() here — the encoder and decoder
        # run their own weight init in their constructors.
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def get_num_layers(self):
        # BUGFIX: this wrapper has no `self.blocks` (only encoder/decoder), so
        # the previous `len(self.blocks)` raised AttributeError when the
        # optimizer factory queried the depth. Report the encoder depth.
        return self.encoder.get_num_layers()

    @torch.jit.ignore
    def no_weight_decay(self):
        # Parameter names excluded from weight decay by the optimizer factory.
        return {'pos_embed', 'cls_token', 'mask_token'}

    def forward(self, x, mask):
        """Return predicted pixels for the masked patches, [B, N_mask, 3*p*p].

        `mask` is boolean [B, num_patches], True = masked.
        """
        x_vis = self.encoder(x, mask)  # [B, N_vis, C_e]
        x_vis = self.encoder_to_decoder(x_vis)  # [B, N_vis, C_d]

        B, N, C = x_vis.shape

        # we don't unshuffle the correct visible token order,
        # but shuffle the pos embedding accordingly.
        expand_pos_embed = self.pos_embed.expand(B, -1, -1).type_as(x).to(x.device).clone().detach()
        pos_emd_vis = expand_pos_embed[~mask].reshape(B, -1, C)
        pos_emd_mask = expand_pos_embed[mask].reshape(B, -1, C)
        x_full = torch.cat([x_vis + pos_emd_vis, self.mask_token + pos_emd_mask], dim=1)
        # notice: if N_mask==0, the shape of x is [B, N_mask, 3 * 16 * 16]
        x = self.decoder(x_full, pos_emd_mask.shape[1])  # [B, N_mask, 3 * 16 * 16]

        return x
def get_num_layer_for_vit(var_name, num_max_layer):
    """Return the layer-decay group index for a ViT parameter name.

    Group 0 holds the patch embedding and the global tokens (cls/mask/pos);
    parameters of ``blocks.i`` map to group ``i + 1``; everything else
    (head, final norm, rel_pos_bias) falls into the last group,
    ``num_max_layer - 1``.
    """
    if var_name in ("cls_token", "mask_token", "pos_embed"):
        return 0
    if var_name.startswith("patch_embed"):
        return 0
    if var_name.startswith("rel_pos_bias"):
        return num_max_layer - 1
    if var_name.startswith("blocks"):
        block_index = int(var_name.split('.')[1])
        return block_index + 1
    return num_max_layer - 1


class LayerDecayValueAssigner(object):
    """Holds one LR scale per layer group and resolves parameter names to them."""

    def __init__(self, values):
        # values[i] is the lr_scale applied to parameter group i.
        self.values = values

    def get_scale(self, layer_id):
        """LR multiplier for the given group index."""
        return self.values[layer_id]

    def get_layer_id(self, var_name):
        """Group index for a parameter name; group count = len(self.values)."""
        return get_num_layer_for_vit(var_name, len(self.values))
def create_optimizer(args, model, get_num_layer=None, get_layer_scale=None, filter_bias_and_bn=True, skip_list=None):
    """Build an optimizer (torch / timm / apex fused) from CLI-style args.

    Args:
        args: namespace providing at least `opt`, `lr`, `weight_decay`,
            `momentum`; optionally `opt_eps` and `opt_betas`.
        model: the model whose parameters are optimized.
        get_num_layer / get_layer_scale: optional callables (see
            LayerDecayValueAssigner) enabling layer-wise LR decay groups.
        filter_bias_and_bn: if True and weight decay is enabled, 1-D params,
            biases and names in the skip set get weight_decay=0 via
            get_parameter_groups.
        skip_list: explicit set of parameter names to exempt from decay;
            overrides model.no_weight_decay().

    Returns:
        A configured optimizer, possibly wrapped in timm's Lookahead when the
        opt string is prefixed with "lookahead_".

    Raises:
        ValueError: for an unrecognized optimizer name.
    """
    opt_lower = args.opt.lower()
    weight_decay = args.weight_decay
    if weight_decay and filter_bias_and_bn:
        # Name-membership container; a set (was a dict literal `{}`, which also
        # supports `in` but reads as the wrong type).
        skip = set()
        if skip_list is not None:
            skip = skip_list
        elif hasattr(model, 'no_weight_decay'):
            skip = model.no_weight_decay()
        parameters = get_parameter_groups(model, weight_decay, skip, get_num_layer, get_layer_scale)
        # Decay is now carried per-group; zero the optimizer-level default.
        weight_decay = 0.
    else:
        parameters = model.parameters()

    if 'fused' in opt_lower:
        assert has_apex and torch.cuda.is_available(), 'APEX and CUDA required for fused optimizers'

    opt_args = dict(lr=args.lr, weight_decay=weight_decay)
    if hasattr(args, 'opt_eps') and args.opt_eps is not None:
        opt_args['eps'] = args.opt_eps
    if hasattr(args, 'opt_betas') and args.opt_betas is not None:
        opt_args['betas'] = args.opt_betas

    print("optimizer settings:", opt_args)

    # Allow "lookahead_<base>" style names: dispatch on the last segment.
    opt_split = opt_lower.split('_')
    opt_lower = opt_split[-1]
    if opt_lower == 'sgd' or opt_lower == 'nesterov':
        opt_args.pop('eps', None)
        optimizer = optim.SGD(parameters, momentum=args.momentum, nesterov=True, **opt_args)
    elif opt_lower == 'momentum':
        opt_args.pop('eps', None)
        optimizer = optim.SGD(parameters, momentum=args.momentum, nesterov=False, **opt_args)
    elif opt_lower == 'adam':
        optimizer = optim.Adam(parameters, **opt_args)
    elif opt_lower == 'adamw':
        optimizer = optim.AdamW(parameters, **opt_args)
    elif opt_lower == 'nadam':
        optimizer = Nadam(parameters, **opt_args)
    elif opt_lower == 'radam':
        optimizer = RAdam(parameters, **opt_args)
    elif opt_lower == 'adamp':
        optimizer = AdamP(parameters, wd_ratio=0.01, nesterov=True, **opt_args)
    elif opt_lower == 'sgdp':
        optimizer = SGDP(parameters, momentum=args.momentum, nesterov=True, **opt_args)
    elif opt_lower == 'adadelta':
        optimizer = optim.Adadelta(parameters, **opt_args)
    elif opt_lower == 'adafactor':
        if not args.lr:
            opt_args['lr'] = None
        optimizer = Adafactor(parameters, **opt_args)
    elif opt_lower == 'adahessian':
        optimizer = Adahessian(parameters, **opt_args)
    elif opt_lower == 'rmsprop':
        optimizer = optim.RMSprop(parameters, alpha=0.9, momentum=args.momentum, **opt_args)
    elif opt_lower == 'rmsproptf':
        optimizer = RMSpropTF(parameters, alpha=0.9, momentum=args.momentum, **opt_args)
    elif opt_lower == 'novograd':
        optimizer = NovoGrad(parameters, **opt_args)
    elif opt_lower == 'nvnovograd':
        optimizer = NvNovoGrad(parameters, **opt_args)
    elif opt_lower == 'fusedsgd':
        opt_args.pop('eps', None)
        optimizer = FusedSGD(parameters, momentum=args.momentum, nesterov=True, **opt_args)
    elif opt_lower == 'fusedmomentum':
        opt_args.pop('eps', None)
        optimizer = FusedSGD(parameters, momentum=args.momentum, nesterov=False, **opt_args)
    elif opt_lower == 'fusedadam':
        optimizer = FusedAdam(parameters, adam_w_mode=False, **opt_args)
    elif opt_lower == 'fusedadamw':
        optimizer = FusedAdam(parameters, adam_w_mode=True, **opt_args)
    elif opt_lower == 'fusedlamb':
        optimizer = FusedLAMB(parameters, **opt_args)
    elif opt_lower == 'fusednovograd':
        opt_args.setdefault('betas', (0.95, 0.98))
        optimizer = FusedNovoGrad(parameters, **opt_args)
    else:
        # BUGFIX: was `assert False and "Invalid optimizer"` — `False and ...`
        # evaluates to False, so the assert fired with NO message, and the
        # bare `raise ValueError` after it was unreachable (except under -O).
        raise ValueError("Invalid optimizer: %s" % args.opt)

    if len(opt_split) > 1:
        if opt_split[0] == 'lookahead':
            optimizer = Lookahead(optimizer)

    return optimizer
# Fail fast on any error and echo each command for easier debugging.
set -ex

# Set the path to save checkpoints
OUTPUT_DIR='output/'
# path to imagenet-1k set
DATA_PATH='/home/datasets/cv/ImageNet_ILSVRC2012'
# path to pretrain model
MODEL_PATH='pretrain/pretrain_mae_vit_base_mask_0.75_400e.pth'

# Launch 8-GPU distributed fine-tuning of the MAE-pretrained ViT-B/16.
# OMP_NUM_THREADS=1 avoids CPU thread oversubscription across the 8 workers.
# batch_size can be adjusted according to the graphics card
OMP_NUM_THREADS=1 python3 -m torch.distributed.launch --nproc_per_node=8 run_class_finetuning.py \
    --model vit_base_patch16_224 \
    --data_path ${DATA_PATH} \
    --finetune ${MODEL_PATH} \
    --output_dir ${OUTPUT_DIR} \
    --batch_size 128 \
    --opt adamw \
    --opt_betas 0.9 0.999 \
    --weight_decay 0.05 \
    --epochs 100 \
    --dist_eval
def get_args():
    """Parse command-line arguments for MAE fine-tuning / evaluation.

    Returns:
        (args, ds_init): the parsed argparse.Namespace and, when
        --enable_deepspeed is set and deepspeed imports successfully,
        `deepspeed.initialize`; otherwise ds_init is None.
    """
    parser = argparse.ArgumentParser('MAE fine-tuning and evaluation script for image classification', add_help=False)
    parser.add_argument('--batch_size', default=64, type=int)
    parser.add_argument('--epochs', default=30, type=int)
    parser.add_argument('--update_freq', default=1, type=int)
    parser.add_argument('--save_ckpt_freq', default=20, type=int)

    # Model parameters
    parser.add_argument('--model', default='deit_base_patch16_224', type=str, metavar='MODEL',
                        help='Name of model to train')

    parser.add_argument('--input_size', default=224, type=int,
                        help='images input size')

    parser.add_argument('--drop', type=float, default=0.0, metavar='PCT',
                        help='Dropout rate (default: 0.)')
    parser.add_argument('--attn_drop_rate', type=float, default=0.0, metavar='PCT',
                        help='Attention dropout rate (default: 0.)')
    parser.add_argument('--drop_path', type=float, default=0.1, metavar='PCT',
                        help='Drop path rate (default: 0.1)')

    parser.add_argument('--disable_eval_during_finetuning', action='store_true', default=False)

    parser.add_argument('--model_ema', action='store_true', default=False)
    parser.add_argument('--model_ema_decay', type=float, default=0.9999, help='')
    parser.add_argument('--model_ema_force_cpu', action='store_true', default=False, help='')

    # Optimizer parameters
    parser.add_argument('--opt', default='adamw', type=str, metavar='OPTIMIZER',
                        help='Optimizer (default: "adamw"')
    parser.add_argument('--opt_eps', default=1e-8, type=float, metavar='EPSILON',
                        help='Optimizer Epsilon (default: 1e-8)')
    parser.add_argument('--opt_betas', default=None, type=float, nargs='+', metavar='BETA',
                        help='Optimizer Betas (default: None, use opt default)')
    parser.add_argument('--clip_grad', type=float, default=None, metavar='NORM',
                        help='Clip gradient norm (default: None, no clipping)')
    parser.add_argument('--momentum', type=float, default=0.9, metavar='M',
                        help='SGD momentum (default: 0.9)')
    parser.add_argument('--weight_decay', type=float, default=0.05,
                        help='weight decay (default: 0.05)')
    parser.add_argument('--weight_decay_end', type=float, default=None, help="""Final value of the
        weight decay. We use a cosine schedule for WD and using a larger decay by
        the end of training improves performance for ViTs.""")

    parser.add_argument('--lr', type=float, default=1e-3, metavar='LR',
                        help='learning rate (default: 1e-3)')
    parser.add_argument('--layer_decay', type=float, default=0.75)

    parser.add_argument('--warmup_lr', type=float, default=1e-6, metavar='LR',
                        help='warmup learning rate (default: 1e-6)')
    parser.add_argument('--min_lr', type=float, default=1e-6, metavar='LR',
                        help='lower lr bound for cyclic schedulers that hit 0 (1e-5)')

    parser.add_argument('--warmup_epochs', type=int, default=5, metavar='N',
                        help='epochs to warmup LR, if scheduler supports')
    parser.add_argument('--warmup_steps', type=int, default=-1, metavar='N',
                        help='num of steps to warmup LR, will overload warmup_epochs if set > 0')

    # Augmentation parameters
    parser.add_argument('--color_jitter', type=float, default=0.4, metavar='PCT',
                        help='Color jitter factor (default: 0.4)')
    # BUGFIX: removed a stray trailing comma after this call, which turned the
    # statement into a throwaway 1-tuple. The odd `" + "` inside the help text
    # is kept verbatim (leftover of an upstream string concatenation) since it
    # only affects the displayed help string.
    parser.add_argument('--aa', type=str, default='rand-m9-mstd0.5-inc1', metavar='NAME',
                        help='Use AutoAugment policy. "v0" or "original". " + "(default: rand-m9-mstd0.5-inc1)')
    parser.add_argument('--smoothing', type=float, default=0.1,
                        help='Label smoothing (default: 0.1)')
    parser.add_argument('--train_interpolation', type=str, default='bicubic',
                        help='Training interpolation (random, bilinear, bicubic default: "bicubic")')

    # Evaluation parameters
    parser.add_argument('--crop_pct', type=float, default=None)

    # * Random Erase params
    parser.add_argument('--reprob', type=float, default=0.25, metavar='PCT',
                        help='Random erase prob (default: 0.25)')
    parser.add_argument('--remode', type=str, default='pixel',
                        help='Random erase mode (default: "pixel")')
    parser.add_argument('--recount', type=int, default=1,
                        help='Random erase count (default: 1)')
    parser.add_argument('--resplit', action='store_true', default=False,
                        help='Do not random erase first (clean) augmentation split')

    # * Mixup params
    parser.add_argument('--mixup', type=float, default=0.8,
                        help='mixup alpha, mixup enabled if > 0.')
    parser.add_argument('--cutmix', type=float, default=1.0,
                        help='cutmix alpha, cutmix enabled if > 0.')
    parser.add_argument('--cutmix_minmax', type=float, nargs='+', default=None,
                        help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)')
    parser.add_argument('--mixup_prob', type=float, default=1.0,
                        help='Probability of performing mixup or cutmix when either/both is enabled')
    parser.add_argument('--mixup_switch_prob', type=float, default=0.5,
                        help='Probability of switching to cutmix when both mixup and cutmix enabled')
    parser.add_argument('--mixup_mode', type=str, default='batch',
                        help='How to apply mixup/cutmix params. Per "batch", "pair", or "elem"')

    # * Finetuning params
    parser.add_argument('--finetune', default='', help='finetune from checkpoint')
    parser.add_argument('--model_key', default='model|module', type=str)
    parser.add_argument('--model_prefix', default='', type=str)
    parser.add_argument('--init_scale', default=0.001, type=float)
    parser.add_argument('--use_mean_pooling', action='store_true')
    parser.set_defaults(use_mean_pooling=True)
    parser.add_argument('--use_cls', action='store_false', dest='use_mean_pooling')

    # Dataset parameters
    parser.add_argument('--data_path', default='/datasets01/imagenet_full_size/061417/', type=str,
                        help='dataset path')
    parser.add_argument('--eval_data_path', default=None, type=str,
                        help='dataset path for evaluation')
    parser.add_argument('--nb_classes', default=1000, type=int,
                        help='number of the classification types')
    parser.add_argument('--imagenet_default_mean_and_std', default=True, action='store_true')

    parser.add_argument('--data_set', default='IMNET', choices=['CIFAR', 'IMNET', 'image_folder'],
                        type=str, help='ImageNet dataset path')
    parser.add_argument('--output_dir', default='',
                        help='path where to save, empty for no saving')
    parser.add_argument('--log_dir', default=None,
                        help='path where to tensorboard log')
    parser.add_argument('--device', default='cuda',
                        help='device to use for training / testing')
    parser.add_argument('--seed', default=0, type=int)
    parser.add_argument('--resume', default='',
                        help='resume from checkpoint')
    parser.add_argument('--auto_resume', action='store_true')
    parser.add_argument('--no_auto_resume', action='store_false', dest='auto_resume')
    parser.set_defaults(auto_resume=True)

    parser.add_argument('--save_ckpt', action='store_true')
    parser.add_argument('--no_save_ckpt', action='store_false', dest='save_ckpt')
    parser.set_defaults(save_ckpt=True)

    parser.add_argument('--start_epoch', default=0, type=int, metavar='N',
                        help='start epoch')
    parser.add_argument('--eval', action='store_true',
                        help='Perform evaluation only')
    parser.add_argument('--dist_eval', action='store_true', default=False,
                        help='Enabling distributed evaluation')
    parser.add_argument('--num_workers', default=10, type=int)
    parser.add_argument('--pin_mem', action='store_true',
                        help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.')
    parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem')
    parser.set_defaults(pin_mem=True)

    # distributed training parameters
    parser.add_argument('--world_size', default=1, type=int,
                        help='number of distributed processes')
    parser.add_argument('--local_rank', default=-1, type=int)
    parser.add_argument('--dist_on_itp', action='store_true')
    parser.add_argument('--dist_url', default='env://',
                        help='url used to set up distributed training')

    parser.add_argument('--enable_deepspeed', action='store_true', default=False)

    # Pre-parse only to check --enable_deepspeed so deepspeed can register its
    # own arguments before the final parse.
    known_args, _ = parser.parse_known_args()

    if known_args.enable_deepspeed:
        try:
            import deepspeed
            from deepspeed import DeepSpeedConfig
            parser = deepspeed.add_config_arguments(parser)
            ds_init = deepspeed.initialize
        except:
            # NOTE(review): bare except + exit(0) reports success on failure;
            # kept as-is to preserve behavior.
            print("Please 'pip install deepspeed==0.4.0'")
            exit(0)
    else:
        ds_init = None

    return parser.parse_args(), ds_init
sampler_train = torch.utils.data.DistributedSampler( + dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True + ) + print("Sampler_train = %s" % str(sampler_train)) + if args.dist_eval: + if len(dataset_val) % num_tasks != 0: + print('Warning: Enabling distributed evaluation with an eval dataset not divisible by process number. ' + 'This will slightly alter validation results as extra duplicate entries are added to achieve ' + 'equal num of samples per-process.') + sampler_val = torch.utils.data.DistributedSampler( + dataset_val, num_replicas=num_tasks, rank=global_rank, shuffle=False) + else: + sampler_val = torch.utils.data.SequentialSampler(dataset_val) + else: + sampler_train = torch.utils.data.RandomSampler(dataset_train) + sampler_val = torch.utils.data.SequentialSampler(dataset_val) + + if global_rank == 0 and args.log_dir is not None: + os.makedirs(args.log_dir, exist_ok=True) + log_writer = utils.TensorboardLogger(log_dir=args.log_dir) + else: + log_writer = None + + data_loader_train = torch.utils.data.DataLoader( + dataset_train, sampler=sampler_train, + batch_size=args.batch_size, + num_workers=args.num_workers, + pin_memory=args.pin_mem, + drop_last=True, + ) + + if dataset_val is not None: + data_loader_val = torch.utils.data.DataLoader( + dataset_val, sampler=sampler_val, + batch_size=int(1.5 * args.batch_size), + num_workers=args.num_workers, + pin_memory=args.pin_mem, + drop_last=False + ) + else: + data_loader_val = None + + mixup_fn = None + mixup_active = args.mixup > 0 or args.cutmix > 0. 
or args.cutmix_minmax is not None + if mixup_active: + print("Mixup is activated!") + mixup_fn = Mixup( + mixup_alpha=args.mixup, cutmix_alpha=args.cutmix, cutmix_minmax=args.cutmix_minmax, + prob=args.mixup_prob, switch_prob=args.mixup_switch_prob, mode=args.mixup_mode, + label_smoothing=args.smoothing, num_classes=args.nb_classes) + + model = create_model( + args.model, + pretrained=False, + num_classes=args.nb_classes, + drop_rate=args.drop, + drop_path_rate=args.drop_path, + attn_drop_rate=args.attn_drop_rate, + drop_block_rate=None, + use_mean_pooling=args.use_mean_pooling, + init_scale=args.init_scale, + ) + + patch_size = model.patch_embed.patch_size + print("Patch size = %s" % str(patch_size)) + args.window_size = (args.input_size // patch_size[0], args.input_size // patch_size[1]) + args.patch_size = patch_size + + if args.finetune: + if args.finetune.startswith('https'): + checkpoint = torch.hub.load_state_dict_from_url( + args.finetune, map_location='cpu', check_hash=True) + else: + checkpoint = torch.load(args.finetune, map_location='cpu') + + print("Load ckpt from %s" % args.finetune) + checkpoint_model = None + for model_key in args.model_key.split('|'): + if model_key in checkpoint: + checkpoint_model = checkpoint[model_key] + print("Load state_dict by model_key = %s" % model_key) + break + if checkpoint_model is None: + checkpoint_model = checkpoint + state_dict = model.state_dict() + for k in ['head.weight', 'head.bias']: + if k in checkpoint_model and checkpoint_model[k].shape != state_dict[k].shape: + print(f"Removing key {k} from pretrained checkpoint") + del checkpoint_model[k] + + all_keys = list(checkpoint_model.keys()) + new_dict = OrderedDict() + for key in all_keys: + if key.startswith('backbone.'): + new_dict[key[9:]] = checkpoint_model[key] + elif key.startswith('encoder.'): + new_dict[key[8:]] = checkpoint_model[key] + else: + new_dict[key] = checkpoint_model[key] + checkpoint_model = new_dict + + # interpolate position embedding + if 
'pos_embed' in checkpoint_model: + pos_embed_checkpoint = checkpoint_model['pos_embed'] + embedding_size = pos_embed_checkpoint.shape[-1] + num_patches = model.patch_embed.num_patches + num_extra_tokens = model.pos_embed.shape[-2] - num_patches + # height (== width) for the checkpoint position embedding + orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) + # height (== width) for the new position embedding + new_size = int(num_patches ** 0.5) + # class_token and dist_token are kept unchanged + if orig_size != new_size: + print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size)) + extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] + # only the position tokens are interpolated + pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] + pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2) + pos_tokens = torch.nn.functional.interpolate( + pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False) + pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) + new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) + checkpoint_model['pos_embed'] = new_pos_embed + + utils.load_state_dict(model, checkpoint_model, prefix=args.model_prefix) + # model.load_state_dict(checkpoint_model, strict=False) + + model.to(device) + + model_ema = None + if args.model_ema: + # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper + model_ema = ModelEma( + model, + decay=args.model_ema_decay, + device='cpu' if args.model_ema_force_cpu else '', + resume='') + print("Using EMA with decay = %.8f" % args.model_ema_decay) + + model_without_ddp = model + n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) + + print("Model = %s" % str(model_without_ddp)) + print('number of params:', n_parameters) + + total_batch_size = args.batch_size * args.update_freq * utils.get_world_size() + 
num_training_steps_per_epoch = len(dataset_train) // total_batch_size + args.lr = args.lr * total_batch_size / 256 + print("LR = %.8f" % args.lr) + print("Batch size = %d" % total_batch_size) + print("Update frequent = %d" % args.update_freq) + print("Number of training examples = %d" % len(dataset_train)) + print("Number of training training per epoch = %d" % num_training_steps_per_epoch) + + num_layers = model_without_ddp.get_num_layers() + if args.layer_decay < 1.0: + assigner = LayerDecayValueAssigner(list(args.layer_decay ** (num_layers + 1 - i) for i in range(num_layers + 2))) + else: + assigner = None + + if assigner is not None: + print("Assigned values = %s" % str(assigner.values)) + + skip_weight_decay_list = model.no_weight_decay() + print("Skip weight decay list: ", skip_weight_decay_list) + + if args.enable_deepspeed: + loss_scaler = None + optimizer_params = get_parameter_groups( + model, args.weight_decay, skip_weight_decay_list, + assigner.get_layer_id if assigner is not None else None, + assigner.get_scale if assigner is not None else None) + model, optimizer, _, _ = ds_init( + args=args, model=model, model_parameters=optimizer_params, dist_init_required=not args.distributed, + ) + + print("model.gradient_accumulation_steps() = %d" % model.gradient_accumulation_steps()) + assert model.gradient_accumulation_steps() == args.update_freq + else: + if args.distributed: + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True) + model_without_ddp = model.module + + optimizer = create_optimizer( + args, model_without_ddp, skip_list=skip_weight_decay_list, + get_num_layer=assigner.get_layer_id if assigner is not None else None, + get_layer_scale=assigner.get_scale if assigner is not None else None) + loss_scaler = NativeScaler() + + print("Use step level LR scheduler!") + lr_schedule_values = utils.cosine_scheduler( + args.lr, args.min_lr, args.epochs, num_training_steps_per_epoch, + 
warmup_epochs=args.warmup_epochs, warmup_steps=args.warmup_steps, + ) + if args.weight_decay_end is None: + args.weight_decay_end = args.weight_decay + wd_schedule_values = utils.cosine_scheduler( + args.weight_decay, args.weight_decay_end, args.epochs, num_training_steps_per_epoch) + print("Max WD = %.7f, Min WD = %.7f" % (max(wd_schedule_values), min(wd_schedule_values))) + + if mixup_fn is not None: + # smoothing is handled with mixup label transform + criterion = SoftTargetCrossEntropy() + elif args.smoothing > 0.: + criterion = LabelSmoothingCrossEntropy(smoothing=args.smoothing) + else: + criterion = torch.nn.CrossEntropyLoss() + + print("criterion = %s" % str(criterion)) + + utils.auto_load_model( + args=args, model=model, model_without_ddp=model_without_ddp, + optimizer=optimizer, loss_scaler=loss_scaler, model_ema=model_ema) + + if args.eval: + test_stats = evaluate(data_loader_val, model, device) + print(f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%") + exit(0) + + print(f"Start training for {args.epochs} epochs") + start_time = time.time() + max_accuracy = 0.0 + for epoch in range(args.start_epoch, args.epochs): + if args.distributed: + data_loader_train.sampler.set_epoch(epoch) + if log_writer is not None: + log_writer.set_step(epoch * num_training_steps_per_epoch * args.update_freq) + train_stats = train_one_epoch( + model, criterion, data_loader_train, optimizer, + device, epoch, loss_scaler, args.clip_grad, model_ema, mixup_fn, + log_writer=log_writer, start_steps=epoch * num_training_steps_per_epoch, + lr_schedule_values=lr_schedule_values, wd_schedule_values=wd_schedule_values, + num_training_steps_per_epoch=num_training_steps_per_epoch, update_freq=args.update_freq, + ) + if args.output_dir and args.save_ckpt: + if (epoch + 1) % args.save_ckpt_freq == 0 or epoch + 1 == args.epochs: + utils.save_model( + args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer, + 
loss_scaler=loss_scaler, epoch=epoch, model_ema=model_ema) + if data_loader_val is not None: + test_stats = evaluate(data_loader_val, model, device) + print(f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%") + if max_accuracy < test_stats["acc1"]: + max_accuracy = test_stats["acc1"] + if args.output_dir and args.save_ckpt: + utils.save_model( + args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer, + loss_scaler=loss_scaler, epoch="best", model_ema=model_ema) + + print(f'Max accuracy: {max_accuracy:.2f}%') + if log_writer is not None: + log_writer.update(test_acc1=test_stats['acc1'], head="perf", step=epoch) + log_writer.update(test_acc5=test_stats['acc5'], head="perf", step=epoch) + log_writer.update(test_loss=test_stats['loss'], head="perf", step=epoch) + + log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, + **{f'test_{k}': v for k, v in test_stats.items()}, + 'epoch': epoch, + 'n_parameters': n_parameters} + else: + log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, + # **{f'test_{k}': v for k, v in test_stats.items()}, + 'epoch': epoch, + 'n_parameters': n_parameters} + + if args.output_dir and utils.is_main_process(): + if log_writer is not None: + log_writer.flush() + with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f: + f.write(json.dumps(log_stats) + "\n") + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) + + +if __name__ == '__main__': + opts, ds_init = get_args() + if opts.output_dir: + Path(opts.output_dir).mkdir(parents=True, exist_ok=True) + main(opts, ds_init) diff --git a/cv/Self-Supervised Learning/MAE/pytorch/run_mae_pretraining.py b/cv/Self-Supervised Learning/MAE/pytorch/run_mae_pretraining.py new file mode 100644 index 0000000000000000000000000000000000000000..0b67975e49ff74f452f37bb7dd478403123fb4e5 --- 
/dev/null +++ b/cv/Self-Supervised Learning/MAE/pytorch/run_mae_pretraining.py @@ -0,0 +1,265 @@ +# -------------------------------------------------------- +# Based on BEiT, timm, DINO and DeiT code bases +# https://github.com/microsoft/unilm/tree/master/beit +# https://github.com/rwightman/pytorch-image-models/tree/master/timm +# https://github.com/facebookresearch/deit +# https://github.com/facebookresearch/dino +# --------------------------------------------------------' + +import argparse +import datetime +import numpy as np +import time +import torch +import torch.backends.cudnn as cudnn +import json +import os + +from pathlib import Path + +from timm.models import create_model +from optim_factory import create_optimizer + +from datasets import build_pretraining_dataset +from engine_for_pretraining import train_one_epoch +from utils import NativeScalerWithGradNormCount as NativeScaler +import utils +import modeling_pretrain + + +def get_args(): + parser = argparse.ArgumentParser('MAE pre-training script', add_help=False) + parser.add_argument('--batch_size', default=64, type=int) + parser.add_argument('--epochs', default=300, type=int) + parser.add_argument('--save_ckpt_freq', default=20, type=int) + + # Model parameters + parser.add_argument('--model', default='pretrain_mae_base_patch16_224', type=str, metavar='MODEL', + help='Name of model to train') + + parser.add_argument('--mask_ratio', default=0.75, type=float, + help='ratio of the visual tokens/patches need be masked') + + parser.add_argument('--input_size', default=224, type=int, + help='images input size for backbone') + + parser.add_argument('--drop_path', type=float, default=0.0, metavar='PCT', + help='Drop path rate (default: 0.1)') + + parser.add_argument('--normlize_target', default=True, type=bool, + help='normalized the target patch pixels') + + # Optimizer parameters + parser.add_argument('--opt', default='adamw', type=str, metavar='OPTIMIZER', + help='Optimizer (default: "adamw"') + 
parser.add_argument('--opt_eps', default=1e-8, type=float, metavar='EPSILON', + help='Optimizer Epsilon (default: 1e-8)') + parser.add_argument('--opt_betas', default=None, type=float, nargs='+', metavar='BETA', + help='Optimizer Betas (default: None, use opt default)') + parser.add_argument('--clip_grad', type=float, default=None, metavar='NORM', + help='Clip gradient norm (default: None, no clipping)') + parser.add_argument('--momentum', type=float, default=0.9, metavar='M', + help='SGD momentum (default: 0.9)') + parser.add_argument('--weight_decay', type=float, default=0.05, + help='weight decay (default: 0.05)') + parser.add_argument('--weight_decay_end', type=float, default=None, help="""Final value of the + weight decay. We use a cosine schedule for WD. + (Set the same value with args.weight_decay to keep weight decay no change)""") + + parser.add_argument('--lr', type=float, default=1.5e-4, metavar='LR', + help='learning rate (default: 1.5e-4)') + parser.add_argument('--warmup_lr', type=float, default=1e-6, metavar='LR', + help='warmup learning rate (default: 1e-6)') + parser.add_argument('--min_lr', type=float, default=1e-5, metavar='LR', + help='lower lr bound for cyclic schedulers that hit 0 (1e-5)') + + parser.add_argument('--warmup_epochs', type=int, default=40, metavar='N', + help='epochs to warmup LR, if scheduler supports') + parser.add_argument('--warmup_steps', type=int, default=-1, metavar='N', + help='epochs to warmup LR, if scheduler supports') + + # Augmentation parameters + parser.add_argument('--color_jitter', type=float, default=0.0, metavar='PCT', + help='Color jitter factor (default: 0.4)') + parser.add_argument('--train_interpolation', type=str, default='bicubic', + help='Training interpolation (random, bilinear, bicubic default: "bicubic")') + + # Dataset parameters + parser.add_argument('--data_path', default='/datasets01/imagenet_full_size/061417/train', type=str, + help='dataset path') + 
parser.add_argument('--imagenet_default_mean_and_std', default=True, action='store_true') + + parser.add_argument('--output_dir', default='', + help='path where to save, empty for no saving') + parser.add_argument('--log_dir', default=None, + help='path where to tensorboard log') + parser.add_argument('--device', default='cuda', + help='device to use for training / testing') + parser.add_argument('--seed', default=0, type=int) + parser.add_argument('--resume', default='', help='resume from checkpoint') + parser.add_argument('--auto_resume', action='store_true') + parser.add_argument('--no_auto_resume', action='store_false', dest='auto_resume') + parser.set_defaults(auto_resume=True) + + parser.add_argument('--start_epoch', default=0, type=int, metavar='N', + help='start epoch') + parser.add_argument('--num_workers', default=10, type=int) + parser.add_argument('--pin_mem', action='store_true', + help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.') + parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem', + help='') + parser.set_defaults(pin_mem=True) + + # distributed training parameters + parser.add_argument('--world_size', default=1, type=int, + help='number of distributed processes') + parser.add_argument('--local_rank', default=-1, type=int) + parser.add_argument('--dist_on_itp', action='store_true') + parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training') + + return parser.parse_args() + + +def get_model(args): + print(f"Creating model: {args.model}") + model = create_model( + args.model, + pretrained=False, + drop_path_rate=args.drop_path, + drop_block_rate=None, + ) + + return model + + +def main(args): + utils.init_distributed_mode(args) + + print(args) + + device = torch.device(args.device) + + # fix the seed for reproducibility + seed = args.seed + utils.get_rank() + torch.manual_seed(seed) + np.random.seed(seed) + # random.seed(seed) + + cudnn.benchmark = True + 
+ model = get_model(args) + patch_size = model.encoder.patch_embed.patch_size + print("Patch size = %s" % str(patch_size)) + args.window_size = (args.input_size // patch_size[0], args.input_size // patch_size[1]) + args.patch_size = patch_size + + # get dataset + dataset_train = build_pretraining_dataset(args) + + if True: # args.distributed: + num_tasks = utils.get_world_size() + global_rank = utils.get_rank() + sampler_rank = global_rank + num_training_steps_per_epoch = len(dataset_train) // args.batch_size // num_tasks + + sampler_train = torch.utils.data.DistributedSampler( + dataset_train, num_replicas=num_tasks, rank=sampler_rank, shuffle=True + ) + print("Sampler_train = %s" % str(sampler_train)) + else: + sampler_train = torch.utils.data.RandomSampler(dataset_train) + + if global_rank == 0 and args.log_dir is not None: + os.makedirs(args.log_dir, exist_ok=True) + log_writer = utils.TensorboardLogger(log_dir=args.log_dir) + else: + log_writer = None + + data_loader_train = torch.utils.data.DataLoader( + dataset_train, sampler=sampler_train, + batch_size=args.batch_size, + num_workers=args.num_workers, + pin_memory=args.pin_mem, + drop_last=True, + worker_init_fn=utils.seed_worker + ) + + model.to(device) + model_without_ddp = model + n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) + + print("Model = %s" % str(model_without_ddp)) + print('number of params: {} M'.format(n_parameters / 1e6)) + + total_batch_size = args.batch_size * utils.get_world_size() + args.lr = args.lr * total_batch_size / 256 + + print("LR = %.8f" % args.lr) + print("Batch size = %d" % total_batch_size) + print("Number of training steps = %d" % num_training_steps_per_epoch) + print("Number of training examples per epoch = %d" % (total_batch_size * num_training_steps_per_epoch)) + + if args.distributed: + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True) + model_without_ddp = model.module + + optimizer 
= create_optimizer( + args, model_without_ddp) + loss_scaler = NativeScaler() + + print("Use step level LR & WD scheduler!") + lr_schedule_values = utils.cosine_scheduler( + args.lr, args.min_lr, args.epochs, num_training_steps_per_epoch, + warmup_epochs=args.warmup_epochs, warmup_steps=args.warmup_steps, + ) + if args.weight_decay_end is None: + args.weight_decay_end = args.weight_decay + wd_schedule_values = utils.cosine_scheduler( + args.weight_decay, args.weight_decay_end, args.epochs, num_training_steps_per_epoch) + print("Max WD = %.7f, Min WD = %.7f" % (max(wd_schedule_values), min(wd_schedule_values))) + + utils.auto_load_model( + args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler) + + print(f"Start training for {args.epochs} epochs") + start_time = time.time() + for epoch in range(args.start_epoch, args.epochs): + if args.distributed: + data_loader_train.sampler.set_epoch(epoch) + if log_writer is not None: + log_writer.set_step(epoch * num_training_steps_per_epoch) + train_stats = train_one_epoch( + model, data_loader_train, + optimizer, device, epoch, loss_scaler, + args.clip_grad, log_writer=log_writer, + start_steps=epoch * num_training_steps_per_epoch, + lr_schedule_values=lr_schedule_values, + wd_schedule_values=wd_schedule_values, + patch_size=patch_size[0], + normlize_target=args.normlize_target, + ) + if args.output_dir: + if (epoch + 1) % args.save_ckpt_freq == 0 or epoch + 1 == args.epochs: + utils.save_model( + args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer, + loss_scaler=loss_scaler, epoch=epoch) + + log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, + 'epoch': epoch, 'n_parameters': n_parameters} + + if args.output_dir and utils.is_main_process(): + if log_writer is not None: + log_writer.flush() + with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f: + f.write(json.dumps(log_stats) + "\n") + + total_time = 
time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) + + +if __name__ == '__main__': + opts = get_args() + if opts.output_dir: + Path(opts.output_dir).mkdir(parents=True, exist_ok=True) + main(opts) diff --git a/cv/Self-Supervised Learning/MAE/pytorch/run_mae_vis.py b/cv/Self-Supervised Learning/MAE/pytorch/run_mae_vis.py new file mode 100644 index 0000000000000000000000000000000000000000..3ee85a23943ae7a9f6e65399fbcf3a83a7be1ab4 --- /dev/null +++ b/cv/Self-Supervised Learning/MAE/pytorch/run_mae_vis.py @@ -0,0 +1,137 @@ +# -*- coding: utf-8 -*- +# @Time : 2021/11/18 22:40 +# @Author : zhao pengfei +# @Email : zsonghuan@gmail.com +# @File : run_mae_vis.py +# -------------------------------------------------------- +# Based on BEiT, timm, DINO and DeiT code bases +# https://github.com/microsoft/unilm/tree/master/beit +# https://github.com/rwightman/pytorch-image-models/tree/master/timm +# https://github.com/facebookresearch/deit +# https://github.com/facebookresearch/dino +# --------------------------------------------------------' + +import argparse +import datetime +import numpy as np +import time +import torch +import torch.backends.cudnn as cudnn +import json +import os + +from PIL import Image + +from pathlib import Path + +from timm.models import create_model + +import utils +import modeling_pretrain +from datasets import DataAugmentationForMAE + +from torchvision.transforms import ToPILImage +from einops import rearrange +from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD + +def get_args(): + parser = argparse.ArgumentParser('MAE visualization reconstruction script', add_help=False) + parser.add_argument('img_path', type=str, help='input image path') + parser.add_argument('save_path', type=str, help='save image path') + parser.add_argument('model_path', type=str, help='checkpoint path of model') + + parser.add_argument('--input_size', 
default=224, type=int, + help='images input size for backbone') + parser.add_argument('--device', default='cuda:0', + help='device to use for training / testing') + parser.add_argument('--imagenet_default_mean_and_std', default=True, action='store_true') + parser.add_argument('--mask_ratio', default=0.75, type=float, + help='ratio of the visual tokens/patches need be masked') + # Model parameters + parser.add_argument('--model', default='pretrain_mae_base_patch16_224', type=str, metavar='MODEL', + help='Name of model to vis') + parser.add_argument('--drop_path', type=float, default=0.0, metavar='PCT', + help='Drop path rate (default: 0.1)') + + return parser.parse_args() + + +def get_model(args): + print(f"Creating model: {args.model}") + model = create_model( + args.model, + pretrained=False, + drop_path_rate=args.drop_path, + drop_block_rate=None, + ) + + return model + + +def main(args): + print(args) + + device = torch.device(args.device) + cudnn.benchmark = True + + model = get_model(args) + patch_size = model.encoder.patch_embed.patch_size + print("Patch size = %s" % str(patch_size)) + args.window_size = (args.input_size // patch_size[0], args.input_size // patch_size[1]) + args.patch_size = patch_size + + model.to(device) + checkpoint = torch.load(args.model_path, map_location='cpu') + model.load_state_dict(checkpoint['model']) + model.eval() + + with open(args.img_path, 'rb') as f: + img = Image.open(f) + img.convert('RGB') + print("img path:", args.img_path) + + transforms = DataAugmentationForMAE(args) + img, bool_masked_pos = transforms(img) + bool_masked_pos = torch.from_numpy(bool_masked_pos) + + with torch.no_grad(): + img = img[None, :] + bool_masked_pos = bool_masked_pos[None, :] + img = img.to(device, non_blocking=True) + bool_masked_pos = bool_masked_pos.to(device, non_blocking=True).flatten(1).to(torch.bool) + outputs = model(img, bool_masked_pos) + + #save original img + mean = torch.as_tensor(IMAGENET_DEFAULT_MEAN).to(device)[None, :, None, 
None] + std = torch.as_tensor(IMAGENET_DEFAULT_STD).to(device)[None, :, None, None] + ori_img = img * std + mean # in [0, 1] + img = ToPILImage()(ori_img[0, :]) + img.save(f"{args.save_path}/ori_img.jpg") + + img_squeeze = rearrange(ori_img, 'b c (h p1) (w p2) -> b (h w) (p1 p2) c', p1=patch_size[0], p2=patch_size[0]) + img_norm = (img_squeeze - img_squeeze.mean(dim=-2, keepdim=True)) / (img_squeeze.var(dim=-2, unbiased=True, keepdim=True).sqrt() + 1e-6) + img_patch = rearrange(img_norm, 'b n p c -> b n (p c)') + img_patch[bool_masked_pos] = outputs + + #make mask + mask = torch.ones_like(img_patch) + mask[bool_masked_pos] = 0 + mask = rearrange(mask, 'b n (p c) -> b n p c', c=3) + mask = rearrange(mask, 'b (h w) (p1 p2) c -> b c (h p1) (w p2)', p1=patch_size[0], p2=patch_size[1], h=14, w=14) + + #save reconstruction img + rec_img = rearrange(img_patch, 'b n (p c) -> b n p c', c=3) + # Notice: To visualize the reconstruction image, we add the predict and the original mean and var of each patch. 
Issue #40 + rec_img = rec_img * (img_squeeze.var(dim=-2, unbiased=True, keepdim=True).sqrt() + 1e-6) + img_squeeze.mean(dim=-2, keepdim=True) + rec_img = rearrange(rec_img, 'b (h w) (p1 p2) c -> b c (h p1) (w p2)', p1=patch_size[0], p2=patch_size[1], h=14, w=14) + img = ToPILImage()(rec_img[0, :].clip(0,0.996)) + img.save(f"{args.save_path}/rec_img.jpg") + + #save random mask img + img_mask = rec_img * mask + img = ToPILImage()(img_mask[0, :]) + img.save(f"{args.save_path}/mask_img.jpg") + +if __name__ == '__main__': + opts = get_args() + main(opts) diff --git a/cv/Self-Supervised Learning/MAE/pytorch/transforms.py b/cv/Self-Supervised Learning/MAE/pytorch/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..3f4c21745e7642f792bc64da04673b73483cbc6e --- /dev/null +++ b/cv/Self-Supervised Learning/MAE/pytorch/transforms.py @@ -0,0 +1,179 @@ +# -------------------------------------------------------- +# Based on BEiT, timm, DINO and DeiT code bases +# https://github.com/microsoft/unilm/tree/master/beit +# https://github.com/rwightman/pytorch-image-models/tree/master/timm +# https://github.com/facebookresearch/deit +# https://github.com/facebookresearch/dino +# --------------------------------------------------------' +import torch +import torchvision.transforms.functional as F +from PIL import Image +import warnings +import math +import random +import numpy as np + + +class ToNumpy: + + def __call__(self, pil_img): + np_img = np.array(pil_img, dtype=np.uint8) + if np_img.ndim < 3: + np_img = np.expand_dims(np_img, axis=-1) + np_img = np.rollaxis(np_img, 2) # HWC to CHW + return np_img + + +class ToTensor: + + def __init__(self, dtype=torch.float32): + self.dtype = dtype + + def __call__(self, pil_img): + np_img = np.array(pil_img, dtype=np.uint8) + if np_img.ndim < 3: + np_img = np.expand_dims(np_img, axis=-1) + np_img = np.rollaxis(np_img, 2) # HWC to CHW + return torch.from_numpy(np_img).to(dtype=self.dtype) + + 
# Map PIL resampling constants to their printable names (used by __repr__).
_pil_interpolation_to_str = {
    Image.NEAREST: 'PIL.Image.NEAREST',
    Image.BILINEAR: 'PIL.Image.BILINEAR',
    Image.BICUBIC: 'PIL.Image.BICUBIC',
    Image.LANCZOS: 'PIL.Image.LANCZOS',
    Image.HAMMING: 'PIL.Image.HAMMING',
    Image.BOX: 'PIL.Image.BOX',
}


def _pil_interp(method):
    # Translate an interpolation name to the PIL constant; anything
    # unrecognized falls back to bilinear.
    if method == 'bicubic':
        return Image.BICUBIC
    elif method == 'lanczos':
        return Image.LANCZOS
    elif method == 'hamming':
        return Image.HAMMING
    else:
        # default bilinear, do we want to allow nearest?
        return Image.BILINEAR


# Pool of interpolations sampled from when interpolation='random'.
_RANDOM_INTERPOLATION = (Image.BILINEAR, Image.BICUBIC)


class RandomResizedCropAndInterpolationWithTwoPic:
    """Crop the given PIL Image to random size and aspect ratio with random interpolation.

    A crop of random size (default: of 0.08 to 1.0) of the original size and a random
    aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio is made. This crop
    is finally resized to given size.
    This is popularly used to train the Inception networks.

    Args:
        size: expected output size of each edge
        scale: range of size of the origin size cropped
        ratio: range of aspect ratio of the origin aspect ratio cropped
        interpolation: Default: PIL.Image.BILINEAR
    """

    def __init__(self, size, second_size=None, scale=(0.08, 1.0), ratio=(3. / 4., 4. / 3.),
                 interpolation='bilinear', second_interpolation='lanczos'):
        # Normalize scalar sizes to (h, w) tuples.
        if isinstance(size, tuple):
            self.size = size
        else:
            self.size = (size, size)
        # When second_size is given, __call__ returns TWO crops of the same
        # region, resized to size and second_size respectively.
        if second_size is not None:
            if isinstance(second_size, tuple):
                self.second_size = second_size
            else:
                self.second_size = (second_size, second_size)
        else:
            self.second_size = None
        if (scale[0] > scale[1]) or (ratio[0] > ratio[1]):
            warnings.warn("range should be of kind (min, max)")

        if interpolation == 'random':
            self.interpolation = _RANDOM_INTERPOLATION
        else:
            self.interpolation = _pil_interp(interpolation)
        self.second_interpolation = _pil_interp(second_interpolation)
        self.scale = scale
        self.ratio = ratio

    @staticmethod
    def get_params(img, scale, ratio):
        """Get parameters for ``crop`` for a random sized crop.

        Args:
            img (PIL Image): Image to be cropped.
            scale (tuple): range of size of the origin size cropped
            ratio (tuple): range of aspect ratio of the origin aspect ratio cropped

        Returns:
            tuple: params (i, j, h, w) to be passed to ``crop`` for a random
                sized crop.
        """
        # NOTE: the exact sequence of random.* calls below defines the RNG
        # stream consumed per image; do not reorder.
        area = img.size[0] * img.size[1]

        # Rejection-sample a crop whose area and aspect ratio fall in range;
        # give up after 10 attempts (same budget as torchvision).
        for attempt in range(10):
            target_area = random.uniform(*scale) * area
            # sample aspect ratio log-uniformly so w/h and h/w are symmetric
            log_ratio = (math.log(ratio[0]), math.log(ratio[1]))
            aspect_ratio = math.exp(random.uniform(*log_ratio))

            w = int(round(math.sqrt(target_area * aspect_ratio)))
            h = int(round(math.sqrt(target_area / aspect_ratio)))

            if w <= img.size[0] and h <= img.size[1]:
                i = random.randint(0, img.size[1] - h)
                j = random.randint(0, img.size[0] - w)
                return i, j, h, w

        # Fallback to central crop
        in_ratio = img.size[0] / img.size[1]
        if in_ratio < min(ratio):
            w = img.size[0]
            h = int(round(w / min(ratio)))
        elif in_ratio > max(ratio):
            h = img.size[1]
            w = int(round(h * max(ratio)))
        else:  # whole image
            w = img.size[0]
            h = img.size[1]
        i = (img.size[1] - h) // 2
        j = (img.size[0] - w) // 2
        return i, j, h, w

    def __call__(self, img):
        """
        Args:
            img (PIL Image): Image to be cropped and resized.

        Returns:
            PIL Image: Randomly cropped and resized image.
        """
        i, j, h, w = self.get_params(img, self.scale, self.ratio)
        # a tuple/list means 'random': pick one interpolation per call
        if isinstance(self.interpolation, (tuple, list)):
            interpolation = random.choice(self.interpolation)
        else:
            interpolation = self.interpolation
        if self.second_size is None:
            return F.resized_crop(img, i, j, h, w, self.size, interpolation)
        else:
            # same crop window, two output resolutions/interpolations
            return F.resized_crop(img, i, j, h, w, self.size, interpolation), \
                F.resized_crop(img, i, j, h, w, self.second_size, self.second_interpolation)

    def __repr__(self):
        if isinstance(self.interpolation, (tuple, list)):
            interpolate_str = ' '.join([_pil_interpolation_to_str[x] for x in self.interpolation])
        else:
            interpolate_str = _pil_interpolation_to_str[self.interpolation]
        format_string = self.__class__.__name__ + '(size={0}'.format(self.size)
        format_string += ', scale={0}'.format(tuple(round(s, 4) for s in self.scale))
        format_string += ', ratio={0}'.format(tuple(round(r, 4) for r in self.ratio))
        format_string += ', interpolation={0}'.format(interpolate_str)
        if self.second_size is not None:
            format_string += ', second_size={0}'.format(self.second_size)
            format_string += ', second_interpolation={0}'.format(_pil_interpolation_to_str[self.second_interpolation])
        format_string += ')'
        return format_string
diff --git a/cv/Self-Supervised Learning/MAE/pytorch/utils.py b/cv/Self-Supervised Learning/MAE/pytorch/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..24ef4d31feef52ff499074a2561d0d20f590eb42
--- /dev/null
+++ b/cv/Self-Supervised Learning/MAE/pytorch/utils.py
@@ -0,0 +1,512 @@
# --------------------------------------------------------
# --------------------------------------------------------
# Based on BEiT, timm, DINO and DeiT code bases
# https://github.com/microsoft/unilm/tree/master/beit
# https://github.com/rwightman/pytorch-image-models/tree/master/timm
# https://github.com/facebookresearch/deit
# https://github.com/facebookresearch/dino
# 
--------------------------------------------------------'
+# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+import io
+import os
+import math
+import time
+import json
+from collections import defaultdict, deque
+import datetime
+import numpy as np
+from timm.utils import get_state_dict
+
+from pathlib import Path
+
+import torch
+import torch.distributed as dist
+# NOTE(review): torch._six was removed in PyTorch >= 1.13; math.inf is a
+# drop-in replacement if this ever needs to run on a newer PyTorch.
+from torch._six import inf
+
+import random
+
+from tensorboardX import SummaryWriter
+
+
+class SmoothedValue(object):
+    """Track a series of values and provide access to smoothed values over a
+    window or the global series average.
+    """
+
+    def __init__(self, window_size=20, fmt=None):
+        # fmt controls how __str__ renders the statistics.
+        if fmt is None:
+            fmt = "{median:.4f} ({global_avg:.4f})"
+        self.deque = deque(maxlen=window_size)
+        self.total = 0.0
+        self.count = 0
+        self.fmt = fmt
+
+    def update(self, value, n=1):
+        # The deque keeps only the last `window_size` raw values (for windowed
+        # stats); total/count accumulate over the whole series for global_avg.
+        self.deque.append(value)
+        self.count += n
+        self.total += value * n
+
+    def synchronize_between_processes(self):
+        """
+        Warning: does not synchronize the deque!
+        """
+        # Sum count/total across all distributed ranks so global_avg agrees
+        # everywhere; windowed stats (median/avg/max/value) stay rank-local.
+        if not is_dist_avail_and_initialized():
+            return
+        t = torch.tensor([self.count, self.total], dtype=torch.float32, device='cuda')
+        dist.barrier()
+        dist.all_reduce(t)
+        t = t.tolist()
+        self.count = int(t[0])
+        self.total = t[1]
+
+    @property
+    def median(self):
+        # Median over the retained window only.
+        d = torch.tensor(list(self.deque))
+        return d.median().item()
+
+    @property
+    def avg(self):
+        # Mean over the retained window only.
+        d = torch.tensor(list(self.deque), dtype=torch.float32)
+        return d.mean().item()
+
+    @property
+    def global_avg(self):
+        # NOTE(review): raises ZeroDivisionError if queried before any update().
+        return self.total / self.count
+
+    @property
+    def max(self):
+        return max(self.deque)
+
+    @property
+    def value(self):
+        # Most recently recorded value.
+        return self.deque[-1]
+
+    def __str__(self):
+        return self.fmt.format(
+            median=self.median,
+            avg=self.avg,
+            global_avg=self.global_avg,
+            max=self.max,
+            value=self.value)
+
+
+class MetricLogger(object):
+    """Collect named SmoothedValue meters and pretty-print them during training."""
+
+    def __init__(self, delimiter="\t"):
+        # defaultdict: meters are created lazily on first update()/getattr.
+        self.meters = defaultdict(SmoothedValue)
+        self.delimiter = delimiter
+
+    def update(self, **kwargs):
+        """Record one scalar per keyword; tensors are unwrapped, None skipped."""
+        for k, v in kwargs.items():
+            if v is None:
+                continue
+            if isinstance(v, torch.Tensor):
+                v = v.item()
+            assert isinstance(v, (float, int))
+            self.meters[k].update(v)
+
+    def __getattr__(self, attr):
+        # Allow logger.loss style access to meters (only called when normal
+        # attribute lookup fails).
+        if attr in self.meters:
+            return self.meters[attr]
+        if attr in self.__dict__:
+            return self.__dict__[attr]
+        raise AttributeError("'{}' object has no attribute '{}'".format(
+            type(self).__name__, attr))
+
+    def __str__(self):
+        loss_str = []
+        for name, meter in self.meters.items():
+            loss_str.append(
+                "{}: {}".format(name, str(meter))
+            )
+        return self.delimiter.join(loss_str)
+
+    def synchronize_between_processes(self):
+        """Reduce every meter's count/total across distributed ranks."""
+        for meter in self.meters.values():
+            meter.synchronize_between_processes()
+
+    def add_meter(self, name, meter):
+        # Register a meter with a custom format string.
+        self.meters[name] = meter
+
+    def log_every(self, iterable, print_freq, header=None):
+        """Yield items from *iterable*, printing meters/ETA every *print_freq* steps.
+
+        NOTE(review): requires len(iterable), so plain generators won't work.
+        """
+        i = 0
+        if not header:
+            header = ''
+        start_time = time.time()
+        end = time.time()
+        iter_time = SmoothedValue(fmt='{avg:.4f}')
+        data_time = SmoothedValue(fmt='{avg:.4f}')
+        # Width of the step counter, padded to the width of len(iterable).
+        space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
+        log_msg = [
+            header,
+            '[{0' + space_fmt + '}/{1}]',
+            'eta: {eta}',
+            '{meters}',
+            'time: {time}',
+            'data: {data}'
+        ]
+        if torch.cuda.is_available():
+            log_msg.append('max mem: {memory:.0f}')
+        log_msg = self.delimiter.join(log_msg)
+        MB = 1024.0 * 1024.0
+        for obj in iterable:
+            # Time spent waiting for data vs. total time per iteration:
+            # data_time is measured up to the yield, iter_time across it.
+            data_time.update(time.time() - end)
+            yield obj
+            iter_time.update(time.time() - end)
+            if i % print_freq == 0 or i == len(iterable) - 1:
+                eta_seconds = iter_time.global_avg * (len(iterable) - i)
+                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
+                if torch.cuda.is_available():
+                    print(log_msg.format(
+                        i, len(iterable), eta=eta_string,
+                        meters=str(self),
+                        time=str(iter_time), data=str(data_time),
+                        memory=torch.cuda.max_memory_allocated() / MB))
+                else:
+                    print(log_msg.format(
+                        i, len(iterable), eta=eta_string,
+                        meters=str(self),
+                        time=str(iter_time), data=str(data_time)))
+            i += 1
+            end = time.time()
+        total_time = time.time() - start_time
+        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+        print('{} Total time: {} ({:.4f} s / it)'.format(
+            header, total_time_str, total_time / len(iterable)))
+
+
+class TensorboardLogger(object):
+    """Thin wrapper over tensorboardX SummaryWriter with an internal step counter."""
+
+    def __init__(self, log_dir):
+        self.writer = SummaryWriter(logdir=log_dir)
+        self.step = 0
+
+    def set_step(self, step=None):
+        # Explicit step if given, otherwise auto-increment.
+        if step is not None:
+            self.step = step
+        else:
+            self.step += 1
+
+    def update(self, head='scalar', step=None, **kwargs):
+        """Log one scalar per keyword under "<head>/<key>" at the current step."""
+        for k, v in kwargs.items():
+            if v is None:
+                continue
+            if isinstance(v, torch.Tensor):
+                v = v.item()
+            assert isinstance(v, (float, int))
+            self.writer.add_scalar(head + "/" + k, v, self.step if step is None else step)
+
+    def flush(self):
+        self.writer.flush()
+
+def seed_worker(worker_id):
+    """DataLoader worker_init_fn: derive numpy/random seeds from torch's seed."""
+    worker_seed = torch.initial_seed() % 2**32
+    np.random.seed(worker_seed)
+    random.seed(worker_seed)
+
+def _load_checkpoint_for_ema(model_ema, checkpoint):
+    """
+    Workaround for ModelEma._load_checkpoint to accept an already-loaded object
+    """
+    # Round-trip through an in-memory buffer because ModelEma._load_checkpoint
+    # expects a file-like object / path, not a state dict.
+    mem_file = io.BytesIO()
+    torch.save(checkpoint, mem_file)
+    mem_file.seek(0)
+    model_ema._load_checkpoint(mem_file)
+
+
+def setup_for_distributed(is_master):
+    """
+    This function disables printing when not in master process
+    """
+    import builtins as __builtin__
+    builtin_print = __builtin__.print
+
+    def print(*args, **kwargs):
+        # force=True lets any rank print (e.g. for debugging).
+        force = kwargs.pop('force', False)
+        if is_master or force:
+            builtin_print(*args, **kwargs)
+
+    # Monkey-patch the global print for the whole process.
+    __builtin__.print = print
+
+
+def is_dist_avail_and_initialized():
+    # True only when torch.distributed is both compiled in and initialized.
+    if not dist.is_available():
+        return False
+    if not dist.is_initialized():
+        return False
+    return True
+
+
+def get_world_size():
+    # 1 when running without distributed training.
+    if not is_dist_avail_and_initialized():
+        return 1
+    return dist.get_world_size()
+
+
+def get_rank():
+    # 0 when running without distributed training.
+    if not is_dist_avail_and_initialized():
+        return 0
+    return dist.get_rank()
+
+
+def is_main_process():
+    return get_rank() == 0
+
+
+def save_on_master(*args, **kwargs):
+    # torch.save, but only on rank 0 to avoid concurrent writes.
+    if is_main_process():
+        torch.save(*args, **kwargs)
+
+
+def init_distributed_mode(args):
+    """Initialize torch.distributed from whichever launcher's env vars are present.
+
+    Mutates args in place: rank, world_size, gpu, distributed, dist_backend.
+    """
+    if args.dist_on_itp:
+        # OpenMPI launcher: translate OMPI_* vars into the torch.distributed ones.
+        args.rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
+        args.world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
+        args.gpu = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
+        args.dist_url = "tcp://%s:%s" % (os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'])
+        os.environ['LOCAL_RANK'] = str(args.gpu)
+        os.environ['RANK'] = str(args.rank)
+        os.environ['WORLD_SIZE'] = str(args.world_size)
+        # ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"]
+    elif 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
+        # torchrun / torch.distributed.launch style environment.
+        args.rank = int(os.environ["RANK"])
+        args.world_size = int(os.environ['WORLD_SIZE'])
+        args.gpu = int(os.environ['LOCAL_RANK'])
+    elif 'SLURM_PROCID' in os.environ:
+        # SLURM: derive local GPU index from the global proc id.
+        args.rank = int(os.environ['SLURM_PROCID'])
+        args.gpu = args.rank % torch.cuda.device_count()
+    else:
+        print('Not using distributed mode')
+        args.distributed = False
+        return
+
+    args.distributed = True
+
+    torch.cuda.set_device(args.gpu)
+    args.dist_backend = 'nccl'
+    print('| distributed init (rank {}): {}, gpu {}'.format(
+        args.rank, args.dist_url, args.gpu), flush=True)
+    torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
+                                         world_size=args.world_size, rank=args.rank)
+    torch.distributed.barrier()
+    # Silence print() on non-master ranks from here on.
+    setup_for_distributed(args.rank == 0)
+
+
+def load_state_dict(model, state_dict, prefix='', ignore_missing="relative_position_index"):
+    """Load *state_dict* into *model* non-strictly, reporting (not raising on)
+    missing/unexpected keys; keys matching any '|'-separated pattern in
+    *ignore_missing* are suppressed from the missing-key warning."""
+    missing_keys = []
+    unexpected_keys = []
+    error_msgs = []
+    # copy state_dict so _load_from_state_dict can modify it
+    metadata = getattr(state_dict, '_metadata', None)
+    state_dict = state_dict.copy()
+    if metadata is not None:
+        state_dict._metadata = metadata
+
+    def load(module, prefix=''):
+        # Recursively invoke each submodule's own _load_from_state_dict so
+        # per-module load hooks/conversions still run (unlike load_state_dict(strict=False)).
+        local_metadata = {} if metadata is None else metadata.get(
+            prefix[:-1], {})
+        module._load_from_state_dict(
+            state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
+        for name, child in module._modules.items():
+            if child is not None:
+                load(child, prefix + name + '.')
+
+    load(model, prefix=prefix)
+
+    # Split missing keys into "worth warning about" vs. deliberately ignored.
+    warn_missing_keys = []
+    ignore_missing_keys = []
+    for key in missing_keys:
+        keep_flag = True
+        for ignore_key in ignore_missing.split('|'):
+            if ignore_key in key:
+                keep_flag = False
+                break
+        if keep_flag:
+            warn_missing_keys.append(key)
+        else:
+            ignore_missing_keys.append(key)
+
+    missing_keys = warn_missing_keys
+
+    if len(missing_keys) > 0:
+        print("Weights of {} not initialized from pretrained model: {}".format(
+            model.__class__.__name__, missing_keys))
+    if len(unexpected_keys) > 0:
+        print("Weights from pretrained model not used in {}: {}".format(
+            model.__class__.__name__, unexpected_keys))
+    if len(ignore_missing_keys) > 0:
+        print("Ignored weights of {} not initialized from pretrained model: {}".format(
+            model.__class__.__name__, ignore_missing_keys))
+    if len(error_msgs) > 0:
+        print('\n'.join(error_msgs))
+
+
+class NativeScalerWithGradNormCount:
+    """AMP loss scaler (torch.cuda.amp.GradScaler) that also returns the
+    pre-clip gradient norm on each update step."""
+
+    state_dict_key = "amp_scaler"
+
+    def __init__(self):
+        self._scaler = torch.cuda.amp.GradScaler()
+
+    def __call__(self, loss, optimizer, clip_grad=None, parameters=None, create_graph=False, update_grad=True):
+        # Backward on the scaled loss; with gradient accumulation, callers pass
+        # update_grad=False on intermediate micro-steps (no optimizer step).
+        self._scaler.scale(loss).backward(create_graph=create_graph)
+        if update_grad:
+            if clip_grad is not None:
+                assert parameters is not None
+                self._scaler.unscale_(optimizer)  # unscale the gradients of optimizer's assigned params in-place
+                norm = torch.nn.utils.clip_grad_norm_(parameters, clip_grad)
+            else:
+                # Still unscale so the reported norm is in true (unscaled) units.
+                self._scaler.unscale_(optimizer)
+                norm = get_grad_norm_(parameters)
+            self._scaler.step(optimizer)
+            self._scaler.update()
+        else:
+            norm = None
+        return norm
+
+    def state_dict(self):
+        return self._scaler.state_dict()
+
+    def load_state_dict(self, state_dict):
+        self._scaler.load_state_dict(state_dict)
+
+
+def get_grad_norm_(parameters, norm_type: float = 2.0) -> torch.Tensor:
+    """Return the total *norm_type*-norm of the gradients of *parameters*
+    (max abs value when norm_type is inf). Params without grads are skipped."""
+    if isinstance(parameters, torch.Tensor):
+        parameters = [parameters]
+    parameters = [p for p in parameters if p.grad is not None]
+    norm_type = float(norm_type)
+    if len(parameters) == 0:
+        return torch.tensor(0.)
+    device = parameters[0].grad.device
+    if norm_type == inf:
+        total_norm = max(p.grad.detach().abs().max().to(device) for p in parameters)
+    else:
+        total_norm = torch.norm(torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]), norm_type)
+    return total_norm
+
+
+def cosine_scheduler(base_value, final_value, epochs, niter_per_ep, warmup_epochs=0,
+                     start_warmup_value=0, warmup_steps=-1):
+    """Build a per-iteration schedule: linear warmup to *base_value*, then
+    cosine decay to *final_value* over epochs * niter_per_ep steps.
+
+    NOTE(review): warmup_steps > 0 with warmup_epochs == 0 sets warmup_iters
+    but skips building warmup_schedule, tripping the length assert below.
+    """
+    warmup_schedule = np.array([])
+    warmup_iters = warmup_epochs * niter_per_ep
+    if warmup_steps > 0:
+        # Explicit step count overrides the epoch-derived warmup length.
+        warmup_iters = warmup_steps
+    print("Set warmup steps = %d" % warmup_iters)
+    if warmup_epochs > 0:
+        warmup_schedule = np.linspace(start_warmup_value, base_value, warmup_iters)
+
+    iters = np.arange(epochs * niter_per_ep - warmup_iters)
+    schedule = np.array(
+        [final_value + 0.5 * (base_value - final_value) * (1 + math.cos(math.pi * i / (len(iters)))) for i in iters])
+
+    schedule = np.concatenate((warmup_schedule, schedule))
+
+    # One value per training iteration.
+    assert len(schedule) == epochs * niter_per_ep
+    return schedule
+
+
+def save_model(args, epoch, model, model_without_ddp, optimizer, loss_scaler, model_ema=None):
+    """Save a checkpoint for *epoch*; native-AMP path (loss_scaler given)
+    writes checkpoint-<epoch>.pth on rank 0, otherwise delegates to the
+    DeepSpeed engine's save_checkpoint."""
+    output_dir = Path(args.output_dir)
+    epoch_name = str(epoch)
+    if loss_scaler is not None:
+        checkpoint_paths = [output_dir / ('checkpoint-%s.pth' % epoch_name)]
+        for checkpoint_path in checkpoint_paths:
+            to_save = {
+                'model': model_without_ddp.state_dict(),
+                'optimizer': optimizer.state_dict(),
+                'epoch': epoch,
+                'scaler': loss_scaler.state_dict(),
+                'args': args,
+            }
+
+            if model_ema is not None:
+                to_save['model_ema'] = get_state_dict(model_ema)
+
+            save_on_master(to_save, checkpoint_path)
+    else:
+        # DeepSpeed manages its own sharded checkpoint format; extra metadata
+        # goes through client_state.
+        client_state = {'epoch': epoch}
+        if model_ema is not None:
+            client_state['model_ema'] = get_state_dict(model_ema)
+        model.save_checkpoint(save_dir=args.output_dir, tag="checkpoint-%s" % epoch_name, client_state=client_state)
+
+
+def auto_load_model(args, model, model_without_ddp, optimizer, loss_scaler, model_ema=None):
+    """Resume training state from args.resume, or (with args.auto_resume) from
+    the highest-numbered checkpoint-* found in args.output_dir. Handles both
+    the native-AMP .pth format and DeepSpeed checkpoint directories."""
+    output_dir = Path(args.output_dir)
+    if loss_scaler is not None:
+        # torch.amp
+        if args.auto_resume and len(args.resume) == 0:
+            import glob
+            all_checkpoints = glob.glob(os.path.join(output_dir, 'checkpoint-*.pth'))
+            latest_ckpt = -1
+            # Pick the numerically largest epoch suffix.
+            for ckpt in all_checkpoints:
+                t = ckpt.split('-')[-1].split('.')[0]
+                if t.isdigit():
+                    latest_ckpt = max(int(t), latest_ckpt)
+            if latest_ckpt >= 0:
+                args.resume = os.path.join(output_dir, 'checkpoint-%d.pth' % latest_ckpt)
+            print("Auto resume checkpoint: %s" % args.resume)
+
+        if args.resume:
+            if args.resume.startswith('https'):
+                checkpoint = torch.hub.load_state_dict_from_url(
+                    args.resume, map_location='cpu', check_hash=True)
+            else:
+                checkpoint = torch.load(args.resume, map_location='cpu')
+            model_without_ddp.load_state_dict(checkpoint['model'])
+            print("Resume checkpoint %s" % args.resume)
+            # Only restore optimizer/epoch/EMA/scaler when present (a bare
+            # pretrain weight file may lack them).
+            if 'optimizer' in checkpoint and 'epoch' in checkpoint:
+                optimizer.load_state_dict(checkpoint['optimizer'])
+                args.start_epoch = checkpoint['epoch'] + 1
+                if hasattr(args, 'model_ema') and args.model_ema:
+                    _load_checkpoint_for_ema(model_ema, checkpoint['model_ema'])
+                if 'scaler' in checkpoint:
+                    loss_scaler.load_state_dict(checkpoint['scaler'])
+                print("With optim & sched!")
+    else:
+        # deepspeed, only support '--auto_resume'.
+        if args.auto_resume:
+            import glob
+            all_checkpoints = glob.glob(os.path.join(output_dir, 'checkpoint-*'))
+            latest_ckpt = -1
+            for ckpt in all_checkpoints:
+                t = ckpt.split('-')[-1].split('.')[0]
+                if t.isdigit():
+                    latest_ckpt = max(int(t), latest_ckpt)
+            if latest_ckpt >= 0:
+                args.resume = os.path.join(output_dir, 'checkpoint-%d' % latest_ckpt)
+                print("Auto resume checkpoint: %d" % latest_ckpt)
+                _, client_states = model.load_checkpoint(args.output_dir, tag='checkpoint-%d' % latest_ckpt)
+                args.start_epoch = client_states['epoch'] + 1
+                if model_ema is not None:
+                    if args.model_ema:
+                        _load_checkpoint_for_ema(model_ema, client_states['model_ema'])
+
+
+def create_ds_config(args):
+    """Write a DeepSpeed JSON config into args.output_dir (Adam + fp16 loss
+    scaling) and point args.deepspeed_config at it."""
+    args.deepspeed_config = os.path.join(args.output_dir, "deepspeed_config.json")
+    with open(args.deepspeed_config, mode="w") as writer:
+        ds_config = {
+            # Global batch = per-GPU batch * grad-accumulation * world size.
+            "train_batch_size": args.batch_size * args.update_freq * get_world_size(),
+            "train_micro_batch_size_per_gpu": args.batch_size,
+            "steps_per_print": 1000,
+            "optimizer": {
+                "type": "Adam",
+                "adam_w_mode": True,
+                "params": {
+                    "lr": args.lr,
+                    "weight_decay": args.weight_decay,
+                    "bias_correction": True,
+                    "betas": [
+                        0.9,
+                        0.999
+                    ],
+                    "eps": 1e-8
+                }
+            },
+            "fp16": {
+                # Dynamic loss scaling (loss_scale 0 = dynamic in DeepSpeed).
+                "enabled": True,
+                "loss_scale": 0,
+                "initial_scale_power": 7,
+                "loss_scale_window": 128
+            }
+        }
+
+        writer.write(json.dumps(ds_config, indent=2))