diff --git a/docs/source/example.rst b/docs/source/example.rst
index 798561db883425623511e9d194f35319597f7dee..1b674a15734aec6f1455bd0a3d5caf728b870d9d 100644
--- a/docs/source/example.rst
+++ b/docs/source/example.rst
@@ -19,7 +19,7 @@ Below is an example of how to use `BenchNIRS` with a custom convolutional neural
 
     from benchnirs.load import load_dataset
    from benchnirs.process import process_epochs
-    from benchnirs.learn import machine_learn, deep_learn
+    from benchnirs.learn import deep_learn
 
 
     ALL_DATA_PATH = '/folder/with/datasets/'  # path to the datasets
diff --git a/src/benchnirs/learn.py b/src/benchnirs/learn.py
index dc79298ed9cd80186c3794952bbcbc0b159cc890..738335140eebdbe52f99dd9fd2d2e4e601248587 100644
--- a/src/benchnirs/learn.py
+++ b/src/benchnirs/learn.py
@@ -164,7 +164,7 @@ def machine_learn(nirs, labels, groups, model, features, normalize=False,
     all_hps = []
     out_split = out_kf.split(nirs, labels, groups)
     for k, out_idx in enumerate(out_split):
-        print(f'\tFOLD #{k+1}')
+        print(f'\tFOLD #{k}')
         nirs_train, nirs_test = nirs[out_idx[0]], nirs[out_idx[1]]
         labels_train, labels_test = labels[out_idx[0]], labels[out_idx[1]]
 
@@ -413,6 +413,9 @@ def _train_dl(nirs_train, labels_train, clf, batch_size, lr, max_epoch,
                 break
         # scheduler.step()
 
+    if device_count > 1:
+        clf = clf.module
+
     results = {'train_losses': train_losses,
                'train_accuracies': train_accuracies,
                'val_losses': val_losses,
@@ -560,19 +563,17 @@ def deep_learn(nirs, labels, groups, model_class, features=None,
     if features is not None:
         nirs = _extract_features(nirs, features)
 
+    # Outer split
     if os.path.isfile(f'{output_folder}/split.pickle'):
-        print('\tSaved training found, loading it...')
+        print('\tSaved k-fold split found, loading it...', end=' ')
         with open(f'{output_folder}/split.pickle', 'rb') as f:
             out_split = pickle.load(f)
-        print('\tDone!')
-
+        print('Done!')
     else:
         if groups is None:
             out_kf = StratifiedKFold(n_splits=OUTER_K)
-            in_kf = StratifiedKFold(n_splits=INNER_K)
         else:
             out_kf = GroupKFold(n_splits=OUTER_K)
-            in_kf = GroupKFold(n_splits=INNER_K)
         out_split = list(out_kf.split(nirs, labels, groups))
 
         if not os.path.isdir(output_folder):
@@ -580,57 +581,70 @@ def deep_learn(nirs, labels, groups, model_class, features=None,
         with open(f'{output_folder}/split.pickle', 'wb') as f:
             pickle.dump(out_split, f)
 
-    for k, out_idx in enumerate(out_split):
-        print(f'\tTraining outer fold #{k}')
-        nirs_train = nirs[out_idx[0]]
-        labels_train = labels[out_idx[0]]
+    # Inner split
+    if groups is None:
+        in_kf = StratifiedKFold(n_splits=INNER_K)
+    else:
+        in_kf = GroupKFold(n_splits=INNER_K)
 
-        if groups is None:
-            groups_train = None
-            nirs_train, labels_train = shuffle(
-                nirs_train, labels_train, random_state=random_state)
-        else:
-            groups_train = groups[out_idx[0]]
-            nirs_train, labels_train, groups_train = shuffle(
-                nirs_train, labels_train, groups_train,
-                random_state=random_state)
+    for k, out_idx in enumerate(out_split):
+        print(f'\tTraining outer fold #{k}')
+        nirs_train = nirs[out_idx[0]]
+        labels_train = labels[out_idx[0]]
 
-        # Min-max scaling
-        if normalize:
-            if features is not None:
-                maxs = nirs_train.max(axis=0)[np.newaxis, :]
-                mins = nirs_train.min(axis=0)[np.newaxis, :]
-            else:
-                maxs = nirs_train.max(axis=(0, 2))
-                maxs = maxs[np.newaxis, :, np.newaxis]
-                mins = nirs_train.min(axis=(0, 2))
-                mins = mins[np.newaxis, :, np.newaxis]
-            nirs_train = (nirs_train - mins) / (maxs - mins)
+        if groups is None:
+            groups_train = None
+            nirs_train, labels_train = shuffle(
+                nirs_train, labels_train, random_state=random_state)
+        else:
+            groups_train = groups[out_idx[0]]
+            nirs_train, labels_train, groups_train = shuffle(
+                nirs_train, labels_train, groups_train,
+                random_state=random_state)
+
+        # Min-max scaling
+        if normalize:
+            if features is not None:
+                maxs = nirs_train.max(axis=0)[np.newaxis, :]
+                mins = nirs_train.min(axis=0)[np.newaxis, :]
+            else:
+                maxs = nirs_train.max(axis=(0, 2))
+                maxs = maxs[np.newaxis, :, np.newaxis]
+                mins = nirs_train.min(axis=(0, 2))
+                mins = mins[np.newaxis, :, np.newaxis]
+            nirs_train = (nirs_train - mins) / (maxs - mins)
 
+        if os.path.isfile(f'{output_folder}/model_k{k}.pt'):
+            print('\t\tClassifier checkpoint found, skipping training')
+        else:
             # Train classifier for each combination of hyperparameters
             hp_list = list(itertools.product(batch_sizes, lrs))
-        in_accuracies = [[] for _ in hp_list]
-        for i, hp in enumerate(hp_list):
-            batch_size, lr = hp[0], hp[1]
-            in_split = in_kf.split(nirs_train, labels_train, groups_train)
-            for in_idx in in_split:
-                nirs_in_train = nirs_train[in_idx[0]]
-                labels_in_train = labels_train[in_idx[0]]
-                nirs_val = nirs_train[in_idx[1]]
-                labels_val = labels_train[in_idx[1]]
-
-                clf = model_class(n_classes).double()
-                clf, _ = _train_dl(nirs_in_train, labels_in_train,
-                                   clf, batch_size, lr, max_epoch,
-                                   early_stop=False,
-                                   random_state=random_state)
-                results = _test_dl(nirs_val, labels_val, clf)
-                in_accuracies[i].append(results['test_accuracy'])
-
-        # Get best hyperparameters
-        in_average_accuracies = np.mean(in_accuracies, axis=1)
-        index_best = np.argmax(in_average_accuracies)
-        best_hps = hp_list[index_best]
+            if len(hp_list) > 1:
+                in_accuracies = [[] for _ in hp_list]
+                for i, hp in enumerate(hp_list):
+                    batch_size, lr = hp[0], hp[1]
+                    in_split = in_kf.split(nirs_train, labels_train,
+                                           groups_train)
+                    for in_idx in in_split:
+                        nirs_in_train = nirs_train[in_idx[0]]
+                        labels_in_train = labels_train[in_idx[0]]
+                        nirs_val = nirs_train[in_idx[1]]
+                        labels_val = labels_train[in_idx[1]]
+
+                        clf = model_class(n_classes).double()
+                        clf, _ = _train_dl(nirs_in_train, labels_in_train,
+                                           clf, batch_size, lr, max_epoch,
+                                           early_stop=False,
+                                           random_state=random_state)
+                        results = _test_dl(nirs_val, labels_val, clf)
+                        in_accuracies[i].append(results['test_accuracy'])
+
+                # Get best hyperparameters
+                in_average_accuracies = np.mean(in_accuracies, axis=1)
+                index_best = np.argmax(in_average_accuracies)
+                best_hps = hp_list[index_best]
+            else:
+                best_hps = (batch_sizes[0], lrs[0])
 
         # Retrain with best hyperparameters
         clf = model_class(n_classes).double()
@@ -640,12 +654,29 @@ def deep_learn(nirs, labels, groups, model_class, features=None,
                                  random_state=random_state)
 
         # Save trained model and training results
+        clf.cpu()
         torch.save(clf.state_dict(), f'{output_folder}/model_k{k}.pt')
         with open(f'{output_folder}/hps_k{k}.pickle', 'wb') as f:
             pickle.dump(best_hps, f)
         with open(f'{output_folder}/results_k{k}.pickle', 'wb') as f:
             pickle.dump(results, f)
 
+        # Plot outer fold loss graph
+        _, ax = plt.subplots(figsize=(12, 6))
+        epochs = [epoch for epoch in range(len(results['train_losses']))]
+        dict_losses = {'Epoch': epochs,
+                       'Training': results['train_losses'],
+                       'Validation': results['val_losses']}
+        df_losses = DataFrame(dict_losses)
+        df_losses = df_losses.melt(
+            id_vars=['Epoch'], value_vars=['Training', 'Validation'],
+            var_name='Condition', value_name='Loss')
+        sns.lineplot(ax=ax, data=df_losses, y='Loss', x='Epoch',
+                     hue='Condition', estimator=None)
+        plt.savefig(f'{output_folder}/k{k}_graph.png',
+                    bbox_inches='tight')
+        plt.close()
+
     all_ks, all_epochs = [], []
     all_train_losses, all_val_losses = [], []
     all_train_accuracies, all_val_accuracies = [], []
@@ -692,7 +723,7 @@ def deep_learn(nirs, labels, groups, model_class, features=None,
             results['y_true'], results['y_pred'], average='weighted')
         additional_metrics.append(prfs[:-1])
 
-    # Plot loss and accuracy graphs
+    # Plot all loss and accuracy graphs
     _, axes = plt.subplots(ncols=2, figsize=(16, 6))
     dict_losses = {'k': all_ks, 'Epoch': all_epochs,
                    'Training': all_train_losses,
@@ -807,6 +838,10 @@ def _train_encdec(x_train, y_train, encoder, decoder, batch_size, lr,
                 print(f'\t\t>Early stopping after {epoch+1} epochs')
                 break
 
+    if device_count > 1:
+        encoder = encoder.module
+        decoder = decoder.module
+
     results = {'train_losses': train_losses,
                'val_losses': val_losses}
     return encoder, decoder, results
@@ -853,6 +888,8 @@ def _proxy_optim(nirs_train, targets_train, groups_train, enc_class, dec_class,
     """
     Train and optimise encoder-decoder.
     """
+    if random_state:
+        torch.manual_seed(random_state)
     if not os.path.isdir(output_folder):
         os.makedirs(output_folder)
 
@@ -864,30 +901,33 @@ def _proxy_optim(nirs_train, targets_train, groups_train, enc_class, dec_class,
 
     # Encoder-decoder optimization
     hp_list = list(itertools.product(batch_sizes, lrs))
-    in_losses = [[] for _ in hp_list]
-    for i, hp in enumerate(hp_list):
-        batch_size, lr = hp[0], hp[1]
-        in_split = in_kf.split(nirs_train, targets_train, groups_train)
-        for in_idx in in_split:
-            nirs_in_train = nirs_train[in_idx[0]]
-            targets_in_train = targets_train[in_idx[0]]
-            nirs_val = nirs_train[in_idx[1]]
-            targets_val = targets_train[in_idx[1]]
-
-            encoder = enc_class().double()
-            decoder = dec_class().double()
-            encoder, decoder, _ = _train_encdec(
-                nirs_in_train, targets_in_train, encoder, decoder,
-                batch_size, lr, max_epoch, early_stop=False,
-                random_state=random_state)
+    if len(hp_list) > 1:
+        in_losses = [[] for _ in hp_list]
+        for i, hp in enumerate(hp_list):
+            batch_size, lr = hp[0], hp[1]
+            in_split = in_kf.split(nirs_train, targets_train, groups_train)
+            for in_idx in in_split:
+                nirs_in_train = nirs_train[in_idx[0]]
+                targets_in_train = targets_train[in_idx[0]]
+                nirs_val = nirs_train[in_idx[1]]
+                targets_val = targets_train[in_idx[1]]
+
+                encoder = enc_class().double()
+                decoder = dec_class().double()
+                encoder, decoder, _ = _train_encdec(
+                    nirs_in_train, targets_in_train, encoder, decoder,
+                    batch_size, lr, max_epoch, early_stop=False,
+                    random_state=random_state)
 
-            results = _test_encdec(nirs_val, targets_val, encoder, decoder)
-            in_losses[i].append(results['test_loss'])
+                results = _test_encdec(nirs_val, targets_val, encoder, decoder)
+                in_losses[i].append(results['test_loss'])
 
-    # Get best hyperparameters
-    in_average_losses = np.mean(in_losses, axis=1)
-    index_best = np.argmin(in_average_losses)
-    best_hps = hp_list[index_best]
+        # Get best hyperparameters
+        in_average_losses = np.mean(in_losses, axis=1)
+        index_best = np.argmin(in_average_losses)
+        best_hps = hp_list[index_best]
+    else:
+        best_hps = (batch_sizes[0], lrs[0])
 
     # Retrain with best hyperparameters
     encoder = enc_class().double()
@@ -898,6 +938,8 @@ def _proxy_optim(nirs_train, targets_train, groups_train, enc_class, dec_class,
                                          random_state=random_state)
 
     # Save trained model and training results
+    encoder.cpu()
+    decoder.cpu()
     torch.save(encoder.state_dict(), f'{output_folder}/encoder.pt')
     torch.save(decoder.state_dict(), f'{output_folder}/decoder.pt')
     with open(f'{output_folder}/hps.pickle', 'wb') as f:
@@ -906,7 +948,7 @@ def _proxy_optim(nirs_train, targets_train, groups_train, enc_class, dec_class,
         pickle.dump(results, f)
 
     # Plot loss graph
-    _, ax = plt.subplots(figsize=(16, 6))
+    _, ax = plt.subplots(figsize=(12, 6))
     epochs = [epoch for epoch in range(len(results['train_losses']))]
     dict_losses = {'Epoch': epochs,
                    'Training': results['train_losses'],
@@ -1011,6 +1053,9 @@ def deep_transfer_learn(nirs, labels, groups, enc_class, dec_class,
         ``y_pred`` being the true and the predictions on the specific
         iteration of the outer cross-validation.
     """
+    if random_state:
+        torch.manual_seed(random_state)
+
     print(f'Labels: {set(labels)}')
     if 8 in set(labels):
         n_classes = len(set(labels)) - 1  # minus unlabelled
@@ -1029,99 +1074,126 @@ def deep_transfer_learn(nirs, labels, groups, enc_class, dec_class,
     if mid_idx.is_integer():
         mid_idx = int(mid_idx)
 
+    # Outer split
     if os.path.isfile(f'{output_folder}/split.pickle'):
-        print('\tSaved training found, loading it...')
+        print('\tSaved k-fold split found, loading it...', end=' ')
         with open(f'{output_folder}/split.pickle', 'rb') as f:
             out_split = pickle.load(f)
-        print('\tDone!')
-
+        print('Done!')
     else:
         if groups is None:
             out_kf = StratifiedKFold(n_splits=OUTER_K)
-            in_kf = StratifiedKFold(n_splits=INNER_K)
         else:
             out_kf = GroupKFold(n_splits=OUTER_K)
-            in_kf = GroupKFold(n_splits=INNER_K)
         out_split = list(out_kf.split(nirs, labels, groups))
 
         if not os.path.isdir(output_folder):
-            os.mkdir(output_folder)
+            os.makedirs(output_folder)
         with open(f'{output_folder}/split.pickle', 'wb') as f:
             pickle.dump(out_split, f)
 
-    for k, out_idx in enumerate(out_split):
-        print(f'\tTraining outer fold #{k}')
-        nirs_train = nirs[out_idx[0]]
-        labels_train = labels[out_idx[0]]
+    # Inner split
+    if groups is None:
+        in_kf = StratifiedKFold(n_splits=INNER_K)
+    else:
+        in_kf = GroupKFold(n_splits=INNER_K)
+
+    for k, out_idx in enumerate(out_split):
+        print(f'\tTraining outer fold #{k}')
+        nirs_train = nirs[out_idx[0]]
+        labels_train = labels[out_idx[0]]
 
-        if groups is None:
-            groups_train = None
-            nirs_train, labels_train = shuffle(
-                nirs_train, labels_train, random_state=random_state)
+        if groups is None:
+            groups_train = None
+            nirs_train, labels_train = shuffle(
+                nirs_train, labels_train, random_state=random_state)
+        else:
+            groups_train = groups[out_idx[0]]
+            nirs_train, labels_train, groups_train = shuffle(
+                nirs_train, labels_train, groups_train,
+                random_state=random_state)
+
+        # Min-max scaling
+        if normalize:
+            if features is not None:
+                maxs = nirs_train.max(axis=0)[np.newaxis, :]
+                mins = nirs_train.min(axis=0)[np.newaxis, :]
         else:
-            groups_train = groups[out_idx[0]]
-            nirs_train, labels_train, groups_train = shuffle(
-                nirs_train, labels_train, groups_train,
-                random_state=random_state)
+            else:
+                maxs = nirs_train.max(axis=(0, 2))
+                maxs = maxs[np.newaxis, :, np.newaxis]
+                mins = nirs_train.min(axis=(0, 2))
+                mins = mins[np.newaxis, :, np.newaxis]
+            nirs_train = (nirs_train - mins) / (maxs - mins)
 
-        # Min-max scaling
-        if normalize:
-            if features is not None:
-                maxs = nirs_train.max(axis=0)[np.newaxis, :]
-                mins = nirs_train.min(axis=0)[np.newaxis, :]
-            else:
-                maxs = nirs_train.max(axis=(0, 2))
-                maxs = maxs[np.newaxis, :, np.newaxis]
-                mins = nirs_train.min(axis=(0, 2))
-                mins = mins[np.newaxis, :, np.newaxis]
-            nirs_train = (nirs_train - mins) / (maxs - mins)
-
-        # Train and optimise proxy models
-        nirs_train_hbo = nirs_train[:, mid_idx:, :]
-        nirs_train_hbr = nirs_train[:, :mid_idx, :]
+        # Train and optimise self-supervised models
+        nirs_train_hbo = nirs_train[:, mid_idx:, :]
+        nirs_train_hbr = nirs_train[:, :mid_idx, :]
 
+        # HbO -> HbR
+        if os.path.isfile(f'{output_folder}/k{k}/hbo/encoder.pt'):
+            print('\t\tSaved HbO encoder found, loading it...', end=' ')
+            enc_hbo = enc_class().double()
+            enc_hbo.load_state_dict(
+                torch.load(f'{output_folder}/k{k}/hbo/encoder.pt'))
+            print('Done!')
+        else:
             enc_hbo, _ = _proxy_optim(
                 nirs_train_hbo, nirs_train_hbr, groups_train,
                 enc_class, dec_class, batch_sizes=batch_sizes, lrs=lrs,
                 max_epoch=max_epoch, random_state=random_state,
                 output_folder=f'{output_folder}/k{k}/hbo')
+        # HbR -> HbO
+        if os.path.isfile(f'{output_folder}/k{k}/hbr/encoder.pt'):
+            print('\t\tSaved HbR encoder found, loading it...', end=' ')
+            enc_hbr = enc_class().double()
+            enc_hbr.load_state_dict(
+                torch.load(f'{output_folder}/k{k}/hbr/encoder.pt'))
+            print('Done!')
+        else:
             enc_hbr, _ = _proxy_optim(
                 nirs_train_hbr, nirs_train_hbo, groups_train,
                 enc_class, dec_class, batch_sizes=batch_sizes, lrs=lrs,
                 max_epoch=max_epoch, random_state=random_state,
                 output_folder=f'{output_folder}/k{k}/hbr')
 
+        if os.path.isfile(f'{output_folder}/k{k}/clf.pt'):
+            print('\t\tClassifier checkpoint found, skipping training')
+        else:
             # Train classifier for each combination of hyperparameters
             hp_list = list(itertools.product(batch_sizes, lrs))
-        in_accuracies = [[] for _ in hp_list]
-        for i, hp in enumerate(hp_list):
-            batch_size, lr = hp[0], hp[1]
-            in_split = in_kf.split(nirs_train, labels_train, groups_train)
-            for in_idx in in_split:
-                nirs_in_train = nirs_train[in_idx[0]]
-                labels_in_train = labels_train[in_idx[0]]
-                nirs_val = nirs_train[in_idx[1]]
-                labels_val = labels_train[in_idx[1]]
-
-                # Remove unlabelled examples
-                idx_in_train = np.where(np.array(labels_in_train) != 8)
-                labels_in_train = labels_in_train[idx_in_train]
-                nirs_in_train = nirs_in_train[idx_in_train]
-                idx_val = np.where(np.array(labels_val) != 8)
-                labels_val = labels_val[idx_val]
-                nirs_val = nirs_val[idx_val]
-
-                clf = model_class(n_classes, enc_hbo, enc_hbr).double()
-                clf, _ = _train_dl(nirs_in_train, labels_in_train,
-                                   clf, batch_size, lr, max_epoch,
-                                   early_stop=False,
-                                   random_state=random_state)
-                results = _test_dl(nirs_val, labels_val, clf)
-                in_accuracies[i].append(results['test_accuracy'])
-
-        # Get best hyperparameters
-        in_average_accuracies = np.mean(in_accuracies, axis=1)
-        index_max = np.argmax(in_average_accuracies)
-        best_hps = hp_list[index_max]
+            if len(hp_list) > 1:
+                in_accuracies = [[] for _ in hp_list]
+                for i, hp in enumerate(hp_list):
+                    batch_size, lr = hp[0], hp[1]
+                    in_split = in_kf.split(nirs_train, labels_train,
+                                           groups_train)
+                    for in_idx in in_split:
+                        nirs_in_train = nirs_train[in_idx[0]]
+                        labels_in_train = labels_train[in_idx[0]]
+                        nirs_val = nirs_train[in_idx[1]]
+                        labels_val = labels_train[in_idx[1]]
+
+                        # Remove unlabelled examples
+                        idx_in_train = np.where(np.array(labels_in_train) != 8)
+                        labels_in_train = labels_in_train[idx_in_train]
+                        nirs_in_train = nirs_in_train[idx_in_train]
+                        idx_val = np.where(np.array(labels_val) != 8)
+                        labels_val = labels_val[idx_val]
+                        nirs_val = nirs_val[idx_val]
+
+                        clf = model_class(n_classes, enc_hbo, enc_hbr).double()
+                        clf, _ = _train_dl(nirs_in_train, labels_in_train,
+                                           clf, batch_size, lr, max_epoch,
+                                           early_stop=False,
+                                           random_state=random_state)
+                        results = _test_dl(nirs_val, labels_val, clf)
+                        in_accuracies[i].append(results['test_accuracy'])
+
+                # Get best hyperparameters
+                in_average_accuracies = np.mean(in_accuracies, axis=1)
+                index_max = np.argmax(in_average_accuracies)
+                best_hps = hp_list[index_max]
+            else:
+                best_hps = (batch_sizes[0], lrs[0])
 
         # Retrain with best hyperparameters
         idx_train = np.where(np.array(labels_train) != 8)
@@ -1134,12 +1206,29 @@ def deep_transfer_learn(nirs, labels, groups, enc_class, dec_class,
                                  random_state=random_state)
 
         # Save trained model and training results
+        clf.cpu()
         torch.save(clf.state_dict(), f'{output_folder}/k{k}/clf.pt')
         with open(f'{output_folder}/k{k}/hps.pickle', 'wb') as f:
             pickle.dump(best_hps, f)
         with open(f'{output_folder}/k{k}/results.pickle', 'wb') as f:
             pickle.dump(results, f)
 
+        # Plot outer fold loss graph
+        _, ax = plt.subplots(figsize=(12, 6))
+        epochs = [epoch for epoch in range(len(results['train_losses']))]
+        dict_losses = {'Epoch': epochs,
+                       'Training': results['train_losses'],
+                       'Validation': results['val_losses']}
+        df_losses = DataFrame(dict_losses)
+        df_losses = df_losses.melt(
+            id_vars=['Epoch'], value_vars=['Training', 'Validation'],
+            var_name='Condition', value_name='Loss')
+        sns.lineplot(ax=ax, data=df_losses, y='Loss', x='Epoch',
+                     hue='Condition', estimator=None)
+        plt.savefig(f'{output_folder}/k{k}/clf_graph.png',
+                    bbox_inches='tight')
+        plt.close()
+
     all_ks, all_epochs = [], []
     all_train_losses, all_val_losses = [], []
     all_train_accuracies, all_val_accuracies = [], []
@@ -1195,7 +1284,7 @@ def deep_transfer_learn(nirs, labels, groups, enc_class, dec_class,
             results['y_true'], results['y_pred'], average='weighted')
         additional_metrics.append(prfs[:-1])
 
-    # Plot loss and accuracy graphs
+    # Plot all loss and accuracy graphs
     _, axes = plt.subplots(ncols=2, figsize=(16, 6))
     dict_losses = {'k': all_ks, 'Epoch': all_epochs,
                    'Training': all_train_losses,
diff --git a/src/custom_model.py b/src/custom_model.py
index 58dab32b23ec502008694c60ba2ae302fa9ed625..c5bd321cfa045bdec1ab4687dbaca74e8a25c8d2 100644
--- a/src/custom_model.py
+++ b/src/custom_model.py
@@ -12,7 +12,7 @@ from scipy import stats
 
 from benchnirs.load import load_dataset
 from benchnirs.process import process_epochs
-from benchnirs.learn import machine_learn, deep_learn
+from benchnirs.learn import deep_learn
 
 
 ALL_DATA_PATH = '/folder/with/datasets/'  # path to the datasets
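
Note: the patch repeats three ideas across deep_learn, _proxy_optim and deep_transfer_learn: it skips the inner k-fold hyperparameter search when only one (batch size, learning rate) combination is supplied, it unwraps models from nn.DataParallel and moves them to the CPU before torch.save so checkpoints stay portable, and it resumes from saved checkpoints instead of retraining. A minimal standalone sketch of these patterns follows, under stated assumptions: TinyNet, the candidate lists and the file name are illustrative stand-ins, not BenchNIRS code, and the isinstance check is a variant of the patch's `device_count > 1` test.

import itertools
import os

import torch
import torch.nn as nn


class TinyNet(nn.Module):
    """Illustrative stand-in for a BenchNIRS model class."""

    def __init__(self, n_classes=2):
        super().__init__()
        self.fc = nn.Linear(4, n_classes)

    def forward(self, x):
        return self.fc(x)


# Pattern 1: only run the inner cross-validation search when there is
# more than one hyperparameter combination to choose from.
batch_sizes, lrs = [64], [0.001]  # a single candidate each
hp_list = list(itertools.product(batch_sizes, lrs))
if len(hp_list) > 1:
    pass  # the inner k-fold search over hp_list would run here
else:
    best_hps = (batch_sizes[0], lrs[0])  # nothing to search over

# Pattern 2: unwrap nn.DataParallel and move the model to the CPU before
# saving, so the state dict carries no 'module.' key prefix and the file
# loads on a machine without a GPU.
clf = TinyNet()
if torch.cuda.device_count() > 1:
    clf = nn.DataParallel(clf)
# ... training with best_hps would happen here ...
if isinstance(clf, nn.DataParallel):
    clf = clf.module
clf.cpu()
torch.save(clf.state_dict(), 'model_k0.pt')

# Pattern 3: on a rerun, load the checkpoint instead of retraining.
if os.path.isfile('model_k0.pt'):
    clf = TinyNet()
    clf.load_state_dict(torch.load('model_k0.pt'))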