gdas diff for train_architecture()

# https://github.com/D-X-Y/AutoDL-Projects/issues/99


import torch
import torch.utils.data
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchvision
import torchvision.transforms as transforms

import tensorflow as tf


# import numpy as np

VISUALIZER = 0
DEBUG = 0
logdir = 'runs/gdas_experiment_1'


if VISUALIZER:
    # https://pytorch.org/tutorials/intermediate/tensorboard_tutorial.html
    from torch.utils.tensorboard import SummaryWriter

    # from tensorboardX import SummaryWriter

    # default `log_dir` is "runs" - we'll be more specific here
    writer = SummaryWriter(logdir)

    # https://github.com/szagoruyko/pytorchviz
    from torchviz import make_dot

if DEBUG:
    torch.autograd.set_detect_anomaly(True)
    tf.debugging.experimental.enable_dump_debug_info(logdir, tensor_debug_mode="FULL_HEALTH", circular_buffer_size=-1)


USE_CUDA = torch.cuda.is_available()

# https://arxiv.org/pdf/1806.09055.pdf#page=12
TEST_DATASET_RATIO = 0.5  # 50 percent of the dataset is dedicated to testing purposes
BATCH_SIZE = 16
NUM_OF_IMAGE_CHANNELS = 3  # RGB
IMAGE_HEIGHT = 32
IMAGE_WIDTH = 32
NUM_OF_IMAGE_CLASSES = 10

SIZE_OF_HIDDEN_LAYERS = 64
NUM_EPOCHS = 1
LEARNING_RATE = 0.025
MOMENTUM = 0.9
DECAY_FACTOR = 0.001  # for keeping Ltrain within an acceptable range
NUM_OF_CELLS = 8
NUM_OF_MIXED_OPS = 4
MIXED_OPS_TENSOR_SHAPE = 4  # shape of the computational kernel used inside each mixed op
NUM_OF_PREVIOUS_CELLS_OUTPUTS = 2  # last_cell_output, second_last_cell_output
NUM_OF_NODES_IN_EACH_CELL = 5  # including the last node, which combines the outputs from all 4 previous nodes
MAX_NUM_OF_CONNECTIONS_PER_NODE = NUM_OF_NODES_IN_EACH_CELL
NUM_OF_CHANNELS = 16
INTERVAL_BETWEEN_REDUCTION_CELLS = 3
PREVIOUS_PREVIOUS = 2  # (n-2)
REDUCTION_STRIDE = 2
NORMAL_STRIDE = 1
TAU_GUMBEL = 0.5
EDGE_WEIGHTS_NETWORK_IN_SIZE = 5
EDGE_WEIGHTS_NETWORK_OUT_SIZE = 2
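
# Note on TAU_GUMBEL (informal): lower temperatures push the Gumbel-Softmax samples
# used in Edge.forward_edge() towards discrete one-hot choices, while higher
# temperatures push them towards uniform; tau = 0.5 is a fairly "hard" setting.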


# https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE,
                                          shuffle=True, num_workers=2)

valset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                      download=True, transform=transform)
valloader = torch.utils.data.DataLoader(valset, batch_size=BATCH_SIZE,
                                        shuffle=False, num_workers=2)

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

TRAIN_BATCH_SIZE = int(len(trainset) * (1 - TEST_DATASET_RATIO))
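
# A minimal sanity check (assumes the standard CIFAR-10 split of 50000 train /
# 10000 test images); prints the dataset and loader sizes so that the split
# constants above can be verified.
if DEBUG:
    print("len(trainset) = ", len(trainset))         # 50000 for CIFAR-10
    print("len(valset) = ", len(valset))             # 10000 for CIFAR-10
    print("TRAIN_BATCH_SIZE = ", TRAIN_BATCH_SIZE)   # len(trainset) * (1 - TEST_DATASET_RATIO)
    print("batches per epoch = ", len(trainloader))  # ceil(len(trainset) / BATCH_SIZE)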




# https://discordapp.com/channels/687504710118146232/703298739732873296/853270183649083433
# for training the edge weights as well as the internal NN function weights
class Edge(nn.Module):

    def __init__(self):
        super(Edge, self).__init__()

        # https://stackoverflow.com/a/51027227/8776167
        # self.linear = nn.Linear(EDGE_WEIGHTS_NETWORK_IN_SIZE, EDGE_WEIGHTS_NETWORK_OUT_SIZE)
        # https://pytorch.org/docs/stable/generated/torch.nn.parameter.Parameter.html
        self.weights = nn.Parameter(torch.zeros(1),
                                    requires_grad=True)  # for edge weights, not for internal NN function weights

        # for the approximate architecture gradient
        self.f_weights = torch.zeros(MIXED_OPS_TENSOR_SHAPE, requires_grad=True)
        self.f_weights_backup = torch.zeros(MIXED_OPS_TENSOR_SHAPE, requires_grad=True)
        self.weight_plus = torch.zeros(MIXED_OPS_TENSOR_SHAPE, requires_grad=True)
        self.weight_minus = torch.zeros(MIXED_OPS_TENSOR_SHAPE, requires_grad=True)
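
    # Sketch of the intended use of weight_plus / weight_minus (an assumption here,
    # following the finite-difference approximation in DARTS,
    # https://arxiv.org/pdf/1806.09055.pdf): w± = w ± eps * dLval/dw, so the
    # second-order term of the architecture gradient can be approximated as
    # (dLtrain(w+, alpha) - dLtrain(w-, alpha)) / (2 * eps).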


    def __freeze_w(self):
        self.weights.requires_grad = False

    def __unfreeze_w(self):
        self.weights.requires_grad = True

    def __freeze_f(self):
        for param in self.f.parameters():
            param.requires_grad = False

    def __unfreeze_f(self):
        for param in self.f.parameters():
            param.requires_grad = True


    # for training the NN functions' internal weights
    def forward_f(self, x):
        self.__unfreeze_f()
        self.__freeze_w()

        # inheritance in python classes and SOLID principles
        # https://en.wikipedia.org/wiki/SOLID
        # https://blog.cleancoder.com/uncle-bob/2020/10/18/Solid-Relevance.html
        return self.f(x)

    # self-defined initial NAS architecture, for supernet architecture edge weight training
    def forward_edge(self, x):
        self.__freeze_f()
        self.__unfreeze_w()

        # Refer to GDAS equations (5) and (6)
        # If the one-hot vector is already there, would summation still be required
        # given that all other entries are forced to 0? It is not strictly required,
        # but without it you do not know which index holds the one-hot 1.
        # https://pytorch.org/docs/stable/nn.functional.html#gumbel-softmax
        # See also https://github.com/D-X-Y/AutoDL-Projects/issues/10#issuecomment-916619163

        gumbel = F.gumbel_softmax(x, tau=TAU_GUMBEL, hard=True)
        chosen_edge = torch.argmax(gumbel, dim=0)  # converts the one-hot encoding into an integer index

        return chosen_edge
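
    # Illustrative sketch (not executed): with hard=True, F.gumbel_softmax returns a
    # one-hot vector on the forward pass while gradients flow through the soft sample.
    # For example, given logits = torch.zeros(NUM_OF_MIXED_OPS, requires_grad=True):
    #     one_hot = F.gumbel_softmax(logits, tau=TAU_GUMBEL, hard=True)  # e.g. tensor([0., 1., 0., 0.])
    #     index = torch.argmax(one_hot, dim=0)                           # recovers the sampled op index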


    def forward(self, x, types):
        y_hat = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH], requires_grad=False)

        if USE_CUDA:
            y_hat = y_hat.cuda()

        if types == "f":
            y_hat = self.forward_f(x)

        elif types == "edge":
            y_hat = self.forward_edge(x)

        return y_hat




class ConvEdge(Edge):
    def __init__(self, stride):
        super().__init__()
        self.f = nn.Conv2d(in_channels=3, out_channels=3, kernel_size=(3, 3), stride=(stride, stride), padding=1)

        # Kaiming He weight initialization
        # https://medium.com/@shoray.goel/kaiming-he-initialization-a8d9ed0b5899
        nn.init.kaiming_uniform_(self.f.weight, mode='fan_in', nonlinearity='relu')


# class LinearEdge(Edge):
#     def __init__(self):
#         super().__init__()
#         self.f = nn.Linear(84, 10)




class MaxPoolEdge(Edge):
    def __init__(self, stride):
        super().__init__()
        self.f = nn.MaxPool2d(kernel_size=3, stride=stride, padding=1, ceil_mode=True)


class AvgPoolEdge(Edge):
    def __init__(self, stride):
        super().__init__()
        self.f = nn.AvgPool2d(kernel_size=3, stride=stride, padding=1, ceil_mode=True)


class Skip(nn.Module):
    def forward(self, x):
        return x


class SkipEdge(Edge):
    def __init__(self):
        super().__init__()
        self.f = Skip()
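
# Shape note (informal, assuming stride=1 and [BATCH_SIZE, 3, 32, 32] inputs): all four
# candidate ops above preserve the spatial size (kernel_size=3 with padding=1 maps
# 32 -> 32, and Skip is the identity), so their outputs can be summed elementwise
# inside a Connection.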




# to collect and manage the different edges between 2 nodes
class Connection(nn.Module):
    def __init__(self, stride):
        super(Connection, self).__init__()

        if USE_CUDA:
            # creates distinct edges and references each of them in a list (self.edges)
            # self.linear_edge = LinearEdge().cuda()
            self.conv2d_edge = ConvEdge(stride).cuda()
            self.maxpool_edge = MaxPoolEdge(stride).cuda()
            self.avgpool_edge = AvgPoolEdge(stride).cuda()
            self.skip_edge = SkipEdge().cuda()

        else:
            # creates distinct edges and references each of them in a list (self.edges)
            # self.linear_edge = LinearEdge()
            self.conv2d_edge = ConvEdge(stride)
            self.maxpool_edge = MaxPoolEdge(stride)
            self.avgpool_edge = AvgPoolEdge(stride)
            self.skip_edge = SkipEdge()

        # self.edges = [self.conv2d_edge, self.maxpool_edge, self.avgpool_edge, self.skip_edge]
        # a plain python list would break the computation graph; use nn.ModuleList as a differentiable python list
        self.edges = nn.ModuleList([self.conv2d_edge, self.maxpool_edge, self.avgpool_edge, self.skip_edge])
        self.edge_weights = torch.zeros(NUM_OF_MIXED_OPS, requires_grad=True)
        # self.edges_results = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
        #                                  requires_grad=False)

        # use a linear transformation (weighted summation) to combine the results from the different edges
        self.combined_feature_map = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
                                                requires_grad=False)
        self.combined_edge_map = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
                                             requires_grad=False)

        if USE_CUDA:
            self.combined_feature_map = self.combined_feature_map.cuda()
            self.combined_edge_map = self.combined_edge_map.cuda()

        for e in range(NUM_OF_MIXED_OPS):
            with torch.no_grad():
                self.edge_weights[e] = self.edges[e].weights

            # https://stackoverflow.com/a/45024500/8776167 extracts the weights learned through NN functions
            # self.f_weights[e] = list(self.edges[e].parameters())


    def reinit(self):
        self.combined_feature_map = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
                                                requires_grad=False)
        self.combined_edge_map = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
                                             requires_grad=False)

        if USE_CUDA:
            self.combined_feature_map = self.combined_feature_map.cuda()
            self.combined_edge_map = self.combined_edge_map.cuda()

    # See https://www.reddit.com/r/pytorch/comments/rtlvtk/tensorboard_issue_with_selfdefined_forward/
    # Tensorboard visualization requires a generic forward() function
    def forward(self, x, types=None):
        edges_results = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
                                    requires_grad=False)
        if USE_CUDA:
            edges_results = edges_results.cuda()

        for e in range(NUM_OF_MIXED_OPS):
            with torch.no_grad():
                edges_results = edges_results + self.edges[e].forward(x, types)

        return edges_results * DECAY_FACTOR
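
    # Usage sketch (hypothetical tensors, for illustration only):
    #     conn = Connection(stride=1)
    #     x = torch.zeros(BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH)
    #     y = conn(x, types="f")  # sum of the four edge outputs, scaled by DECAY_FACTOR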




# to collect and manage multiple different connections between a particular node and its neighbouring nodes
class Node(nn.Module):
    def __init__(self, stride):
        super(Node, self).__init__()

        # two types of output connections
        # Type 1: (multiple edges) the output connects to the inputs of the other intermediate nodes
        # Type 2: (single edge) the output connects directly to the final output node

        # Type 1
        self.connections = nn.ModuleList([Connection(stride) for i in range(MAX_NUM_OF_CONNECTIONS_PER_NODE)])

        # Type 2
        # depends on the PREVIOUS node's Type 1 output
        self.output = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
                                  requires_grad=False)  # for initialization

        if USE_CUDA:
            self.output = self.output.cuda()


    def reinit(self):
        self.output = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
                                  requires_grad=False)
        if USE_CUDA:
            self.output = self.output.cuda()


    # See https://www.reddit.com/r/pytorch/comments/rtlvtk/tensorboard_issue_with_selfdefined_forward/
    # Tensorboard visualization requires a generic forward() function
    def forward(self, x, node_num=0, types=None):
        value = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
                            requires_grad=False)

        # not all nodes have the same number of Type-1 output connections
        for cc in range(MAX_NUM_OF_CONNECTIONS_PER_NODE - node_num - 1):
            y = self.connections[cc].forward(x, types)

            # tensorflow does not like the use of self.variable inside def forward(), unlike Pytorch.
            # Tensorflow prefers the use of a new intermediate variable instead of self.variable
            if types == "f":
                value = self.connections[cc].combined_feature_map
            else:  # "edge"
                value = self.connections[cc].combined_edge_map

            # combines all the feature maps from the different mixed-ops edges
            value = value + y  # Ltrain(w±, alpha)

            # stores the addition result for the next for-loop iteration
            if types == "f":
                self.connections[cc].combined_feature_map = value
            else:  # "edge"
                self.connections[cc].combined_edge_map = value

        decayed_value = value * DECAY_FACTOR

        if USE_CUDA:
            decayed_value = decayed_value.cuda()

        return decayed_value
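
    # Note (informal): DECAY_FACTOR scales both the Connection outputs and the node
    # value above; as stated in the constants section, this is an ad-hoc scaling for
    # keeping Ltrain within an acceptable range.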




# to manage all nodes within a cell
class Cell(nn.Module):
    def __init__(self, stride):
        super(Cell, self).__init__()

        # all the coloured edges inside
        # https://user-images.githubusercontent.com/3324659/117573177-20ea9a80-b109-11eb-9418-16e22e684164.png
        # A single cell contains 'NUM_OF_NODES_IN_EACH_CELL' distinct nodes.
        # For the k-th node, we have (k+1) preceding nodes.
        # Each intermediate state, 0->3 ('NUM_OF_NODES_IN_EACH_CELL-1'),
        # is connected to each previous intermediate state
        # as well as the outputs of the previous two cells, c_{k-2} and c_{k-1} (after a preprocessing layer).
        # previous_previous_cell_output = c_{k-2}
        # previous_cell_output = c_{k-1}
        self.nodes = nn.ModuleList([Node(stride) for i in range(NUM_OF_NODES_IN_EACH_CELL)])

        # just for variable initialization
        self.previous_cell = 0
        self.previous_previous_cell = 0
        self.output = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
                                  requires_grad=False)

        if USE_CUDA:
            self.output = self.output.cuda()


    def reinit(self):
        self.output = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
                                  requires_grad=False)
        if USE_CUDA:
            self.output = self.output.cuda()


    # See https://www.reddit.com/r/pytorch/comments/rtlvtk/tensorboard_issue_with_selfdefined_forward/
    # Tensorboard visualization requires a generic forward() function
    def forward(self, x, x1, x2, c=0, types=None):

        value = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
                            requires_grad=False)

        for n in range(NUM_OF_NODES_IN_EACH_CELL):
            if c <= 1:
                if n == 0:
                    # uses the datasets as input
                    # x = train_inputs

                    if USE_CUDA:
                        x = x.cuda()

                    # combines all the feature maps from the different mixed-ops edges
                    self.nodes[n].output = \
                        self.nodes[n].forward(x, node_num=n, types=types)  # Ltrain(w±, alpha)

                else:
                    # uses the feature map outputs from the previous neighbour nodes for further processing
                    for ni in range(n):
                        # nodes[ni] for previous nodes only
                        # connections[n-ni-1] for neighbour nodes only
                        if types == "f":
                            x = self.nodes[ni].connections[n-ni-1].combined_feature_map
                        else:  # "edge"
                            x = self.nodes[ni].connections[n-ni-1].combined_edge_map

                        # combines all the feature maps from the different mixed-ops edges
                        self.nodes[n].output = self.nodes[n].output + \
                            self.nodes[n].forward(x, node_num=n, types=types)  # Ltrain(w±, alpha)

            else:
                if n == 0:
                    # uses the feature map outputs from the previous neighbour cells for further processing
                    self.nodes[n].output = \
                        self.nodes[n].forward(x1, node_num=n, types=types) + \
                        self.nodes[n].forward(x2, node_num=n, types=types)  # Ltrain(w±, alpha)

                else:
                    # uses the feature map outputs from the previous neighbour nodes for further processing
                    for ni in range(n):
                        # nodes[ni] for previous nodes only
                        # connections[n-ni-1] for neighbour nodes only
                        if types == "f":
                            x = self.nodes[ni].connections[n-ni-1].combined_feature_map
                        else:  # "edge"
                            x = self.nodes[ni].connections[n-ni-1].combined_edge_map

                        # combines all the feature maps from the different mixed-ops edges
                        self.nodes[n].output = self.nodes[n].output + \
                            self.nodes[n].forward(x, node_num=n, types=types)  # Ltrain(w±, alpha)

                    # uses the feature map outputs from the previous neighbour cells for further processing
                    self.nodes[n].output = self.nodes[n].output + \
                        self.nodes[n].forward(x1, node_num=n, types=types) + \
                        self.nodes[n].forward(x2, node_num=n, types=types)  # Ltrain(w±, alpha)

            # 'add' then 'concat' the feature maps from different nodes
            # needs to take care of tensor dimension mismatch
            # See https://github.com/D-X-Y/AutoDL-Projects/issues/99#issuecomment-869100416
            # self.output = self.output + self.nodes[n].output

            # tensorflow does not like the use of self.variable inside def forward(), unlike Pytorch.
            # Tensorflow prefers the use of a new intermediate variable instead of self.variable
            value = self.output

            if USE_CUDA:
                self.nodes[n].output = self.nodes[n].output.cuda()
                value = value.cuda()

            value = value + self.nodes[n].output
            self.output = value




# to manage all nodes
class Graph(nn.Module):
    def __init__(self):
        super(Graph, self).__init__()

        stride = 1  # just to initialize a variable

        # for i in range(NUM_OF_CELLS):
        #     if i % INTERVAL_BETWEEN_REDUCTION_CELLS == 0:
        #         stride = REDUCTION_STRIDE  # to emulate a reduction cell by using a normal cell with stride=2
        #     else:
        #         stride = NORMAL_STRIDE  # normal cell

        self.cells = nn.ModuleList([Cell(stride) for i in range(NUM_OF_CELLS)])

        self.linears = nn.Linear(NUM_OF_IMAGE_CHANNELS * IMAGE_HEIGHT * IMAGE_WIDTH, NUM_OF_IMAGE_CLASSES)

        self.softmax = nn.Softmax(1)


    def reinit(self):
        # See https://discuss.pytorch.org/t/tensorboard-issue-with-self-defined-forward-function/140628/20?u=promach
        for c in range(NUM_OF_CELLS):
            self.cells[c].reinit()

            for n in range(NUM_OF_NODES_IN_EACH_CELL):
                self.cells[c].nodes[n].reinit()

                # not all nodes have the same number of Type-1 output connections
                for cc in range(MAX_NUM_OF_CONNECTIONS_PER_NODE - n - 1):
                    self.cells[c].nodes[n].connections[cc].reinit()


    def print_debug(self):
        for c in range(NUM_OF_CELLS):
            for n in range(NUM_OF_NODES_IN_EACH_CELL):
                # not all nodes have the same number of Type-1 output connections
                for cc in range(MAX_NUM_OF_CONNECTIONS_PER_NODE - n - 1):
                    for e in range(NUM_OF_MIXED_OPS):

                        if DEBUG:
                            print("c = ", c, " , n = ", n, " , cc = ", cc, " , e = ", e)

                            print("graph.cells[", c, "].nodes[", n, "].connections[", cc,
                                  "].combined_feature_map.grad_fn = ",
                                  self.cells[c].nodes[n].connections[cc].combined_feature_map.grad_fn)

                            print("graph.cells[", c, "].output.grad_fn = ",
                                  self.cells[c].output.grad_fn)

                            print("graph.cells[", c, "].nodes[", n, "].output.grad_fn = ",
                                  self.cells[c].nodes[n].output.grad_fn)

                        if VISUALIZER == 0:
                            self.cells[c].nodes[n].output.retain_grad()
                            print("gradwalk(graph.cells[", c, "].nodes[", n, "].output.grad_fn)")
                            # gradwalk(graph.cells[c].nodes[n].output.grad_fn)

            if DEBUG:
                print("graph.cells[", c, "].output.grad_fn = ",
                      self.cells[c].output.grad_fn)

            if VISUALIZER == 0:
                self.cells[c].output.retain_grad()
                print("gradwalk(graph.cells[", c, "].output.grad_fn)")
                # gradwalk(graph.cells[c].output.grad_fn)


    # See https://www.reddit.com/r/pytorch/comments/rtlvtk/tensorboard_issue_with_selfdefined_forward/
    # Tensorboard visualization requires a generic forward() function
    def forward(self, x, types=None):

        # train_inputs = x

        # https://www.reddit.com/r/learnpython/comments/no7btk/how_to_carry_extra_information_across_dag/
        # https://docs.python.org/3/tutorial/datastructures.html

        # generates a supernet consisting of 'NUM_OF_CELLS' cells
        # each cell contains 'NUM_OF_NODES_IN_EACH_CELL' nodes
        # refer to PNASNet https://arxiv.org/pdf/1712.00559.pdf#page=5 for the cell arrangement
        # https://pytorch.org/tutorials/beginner/examples_autograd/two_layer_net_custom_function.html

        # encodes the cells and nodes arrangement in the multigraph

        outputs1 = 0  # just for initialization, no special meaning

        for c in range(NUM_OF_CELLS):
            x1 = self.cells[c - 1].output
            x2 = self.cells[c - PREVIOUS_PREVIOUS].output

            self.cells[c].forward(x, x1, x2, c, types=types)

        output_tensor = self.cells[NUM_OF_CELLS - 1].output
        output_tensor = output_tensor.view(output_tensor.shape[0], -1)

        if USE_CUDA:
            output_tensor = output_tensor.cuda()

        if DEBUG and VISUALIZER == 0:
            print("gradwalk(output_tensor.grad_fn)")
            # gradwalk(output_tensor.grad_fn)

        if USE_CUDA:
            outputs1 = self.linears(output_tensor).cuda()
        else:
            outputs1 = self.linears(output_tensor)

        outputs1 = self.softmax(outputs1)

        if USE_CUDA:
            outputs1 = outputs1.cuda()

        return outputs1
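
# Usage sketch (hypothetical tensors, for illustration only):
#     graph = Graph()
#     images = torch.zeros(BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH)
#     probs = graph(images, types="f")  # [BATCH_SIZE, NUM_OF_IMAGE_CLASSES] softmax outputs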




total_grad_out = []
total_grad_in = []


def hook_fn_backward(module, grad_input, grad_output):
    print(module)  # for distinguishing modules

    # To follow the order of back-propagation, print grad_output first
    print('grad_output', grad_output)

    # Then print grad_input
    print('grad_input', grad_input)

    # Save to the global variables
    total_grad_in.append(grad_input)
    total_grad_out.append(grad_output)




# for tracing the gradient back-propagation operations
def gradwalk(x, _depth=0):
    if hasattr(x, 'grad'):
        x = x.grad

    if hasattr(x, 'next_functions'):
        for fn in x.next_functions:
            print(' ' * _depth + str(fn))
            gradwalk(fn[0], _depth + 1)
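
# Usage sketch (assumes a tensor with a populated autograd graph):
#     y = graph(images, types="f")
#     gradwalk(y.grad_fn)  # prints the backward graph, one node per line, indented by depth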




# Function to convert the model to ONNX
def Convert_ONNX(model, model_input):

    # Export the model
    torch.onnx.export(model,                     # model being run
                      model_input,               # model input (or a tuple for multiple inputs)
                      "gdas.onnx",               # where to save the model
                      export_params=True,        # store the trained parameter weights inside the model file
                      opset_version=10,          # the ONNX version to export the model to
                      do_constant_folding=True,  # whether to execute constant folding for optimization
                      input_names=['modelInput'],    # the model's input names
                      output_names=['modelOutput'],  # the model's output names
                      dynamic_axes={'modelInput': {0: 'batch_size'},  # variable length axes
                                    'modelOutput': {0: 'batch_size'}})
    print(" ")
    print('Model has been converted to ONNX')
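
# Usage sketch (hypothetical input, for illustration only):
#     dummy_input = torch.zeros(BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH)
#     Convert_ONNX(graph, dummy_input)  # writes "gdas.onnx" to the working directory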




# https://translate.google.com/translate?sl=auto&tl=en&u=http://khanrc.github.io/nas-4-darts-tutorial.html
def train_NN(forward_pass_only):
    if DEBUG:
        print("Entering train_NN(), forward_pass_only = ", forward_pass_only)

    graph = Graph()

    if USE_CUDA:
        graph = graph.cuda()

    if DEBUG:
        modules = graph.named_children()
        print("modules = ", modules)

    if VISUALIZER == 0:
        # Tensorboard does not like backward hooks
        for name, module in graph.named_modules():
            module.register_full_backward_hook(hook_fn_backward)

    criterion = nn.CrossEntropyLoss()
    # criterion = nn.BCELoss()
    optimizer1 = optim.SGD(graph.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)

    # just for initialization, no special meaning
    Ltrain = 0
    NN_input = 0
    NN_output = torch.tensor(0)
    NN_train_labels = 0

    for train_data, val_data in zip(trainloader, valloader):

        NN_input, NN_train_labels = train_data
        # val_inputs, val_labels = val_data

        if USE_CUDA:
            NN_input = NN_input.cuda()
            NN_train_labels = NN_train_labels.cuda()

        # normalize the inputs
        NN_input = NN_input / 25