/experiment_meow_main.py (6e113ad7abec0f20ba45f851c87418d4a1ceadb7) (5987 bytes) (mode 100644) (type blob)
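# Training entry point: trains a crowd-counting model (M1/M2/M3, CustomCNNv2,
# BigTailM1/BigTailM2) on the ShanghaiTech dataset with PyTorch Ignite and logs
# metrics to Comet ML.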

from comet_ml import Experiment

from args_util import meow_parse
from data_flow import get_dataloader, create_image_list
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Loss
from ignite.handlers import Checkpoint, DiskSaver
from crowd_counting_error_metrics import CrowdCountingMeanAbsoluteError, CrowdCountingMeanSquaredError
from visualize_util import get_readable_time

import torch
from torch import nn
from models.meow_experiment.kitten_meow_1 import M1, M2, M3
from models.meow_experiment.ccnn_tail import BigTailM1, BigTailM2
from models import CustomCNNv2
import os
from model_util import get_lr

# Comet ML experiment-tracking configuration (note: the API key is hard-coded here)
COMET_ML_API = "S3mM1eMq6NumMxk2QJAXASkUM"
PROJECT_NAME = "meow-one-experiment-insita"


def very_simple_param_count(model):
    # count every parameter element in the model (trainable and frozen alike)
    result = sum(p.numel() for p in model.parameters())
    return result


if __name__ == "__main__":
    experiment = Experiment(project_name=PROJECT_NAME, api_key=COMET_ML_API)
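    # run on the GPU when one is available, otherwise fall back to the CPU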
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    args = meow_parse()
    print(args)

    experiment.set_name(args.task_id)
    experiment.set_cmd_args()
    experiment.log_text(args.note)

    DATA_PATH = args.input
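    # the dataset root (args.input) is expected to contain train_data/ and test_data/ sub-folders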
    TRAIN_PATH = os.path.join(DATA_PATH, "train_data")
    TEST_PATH = os.path.join(DATA_PATH, "test_data")
    dataset_name = args.datasetname
    if dataset_name == "shanghaitech":
        print("will use shanghaitech dataset with crop")
    elif dataset_name == "shanghaitech_keepfull":
        print("will use shanghaitech_keepfull")
    else:
        print("cannot detect dataset_name")
        print("current dataset_name is ", dataset_name)

    # create list
    train_list = create_image_list(TRAIN_PATH)
    test_list = create_image_list(TEST_PATH)

    # create data loader
    train_loader, val_loader, test_loader = get_dataloader(train_list, None, test_list, dataset_name=dataset_name, batch_size=args.batch_size)
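    # no validation list is passed (None); the test set also serves as the validation set below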

    print("len train_loader ", len(train_loader))

    # model
    model_name = args.model
    experiment.log_other("model", model_name)
    if model_name == "M1":
        model = M1()
    elif model_name == "M2":
        model = M2()
    elif model_name == "M3":
        model = M3()
    elif model_name == "CustomCNNv2":
        model = CustomCNNv2()
    elif model_name == "BigTailM1":
        model = BigTailM1()
    elif model_name == "BigTailM2":
        model = BigTailM2()
    else:
        print("error: unrecognized model name ", model_name)
        exit(-1)
    n_param = very_simple_param_count(model)
    experiment.log_other("n_param", n_param)
    if hasattr(model, 'model_note'):
        experiment.log_other("model_note", model.model_note)
    model = model.to(device)

    # loss function: sum of squared errors (MSE with reduction='sum')
    loss_fn = nn.MSELoss(reduction='sum').to(device)

    optimizer = torch.optim.Adam(model.parameters(), args.lr,
                                 weight_decay=args.decay)

    trainer = create_supervised_trainer(model, optimizer, loss_fn, device=device)
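    # the evaluator runs the model over a whole data loader and reports crowd-counting MAE/MSE plus the summed loss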
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                'mae': CrowdCountingMeanAbsoluteError(),
                                                'mse': CrowdCountingMeanSquaredError(),
                                                'loss': Loss(loss_fn)
                                            }, device=device)
    print(model)

    print(args)

    if len(args.load_model) > 0:
        load_model_path = args.load_model
        print("load mode " + load_model_path)
        to_load = {'trainer': trainer, 'model': model, 'optimizer': optimizer}
        checkpoint = torch.load(load_model_path)
        Checkpoint.load_objects(to_load=to_load, checkpoint=checkpoint)
        print("load model complete")
        for param_group in optimizer.param_groups:
            param_group['lr'] = args.lr
            print("change lr to ", args.lr)
    else:
        print("no checkpoint specified, training from scratch")


    @trainer.on(Events.ITERATION_COMPLETED(every=50))
    def log_training_loss(trainer):
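        # print the running loss every 50 iterations, prefixed with a human-readable timestamp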
        timestamp = get_readable_time()
        print(timestamp + " Epoch[{}] Loss: {:.2f}".format(trainer.state.epoch, trainer.state.output))


    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(trainer):
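        # evaluate on the full training set and log the epoch-level metrics to Comet ML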
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        timestamp = get_readable_time()
        print(timestamp + " Training set Results - Epoch: {}  Avg mae: {:.2f} Avg mse: {:.2f} Avg loss: {:.2f}"
              .format(trainer.state.epoch, metrics['mae'], metrics['mse'], metrics['loss']))
        experiment.log_metric("epoch", trainer.state.epoch)
        experiment.log_metric("train_mae", metrics['mae'])
        experiment.log_metric("train_mse", metrics['mse'])
        experiment.log_metric("train_loss", metrics['loss'])
        experiment.log_metric("lr", get_lr(optimizer))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(trainer):
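        # evaluate on the test set (used here as the validation set) and log the metrics to Comet ML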
        evaluator.run(test_loader)
        metrics = evaluator.state.metrics
        timestamp = get_readable_time()
        print(timestamp + " Validation set Results - Epoch: {}  Avg mae: {:.2f} Avg mse: {:.2f} Avg loss: {:.2f}"
              .format(trainer.state.epoch, metrics['mae'], metrics['mse'], metrics['loss']))
        experiment.log_metric("valid_mae", metrics['mae'])
        experiment.log_metric("valid_mse", metrics['mse'])
        experiment.log_metric("valid_loss", metrics['loss'])


    # checkpointing: periodically save trainer/model/optimizer state so training can be resumed (see args.load_model above)
    to_save = {'trainer': trainer, 'model': model, 'optimizer': optimizer}
    save_handler = Checkpoint(to_save, DiskSaver('saved_model/' + args.task_id, create_dir=True, atomic=True),
                              filename_prefix=args.task_id,
                              n_saved=5)

    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=5), save_handler)

    trainer.run(train_loader, max_epochs=args.epochs)

