# gdas_diff_14_09_2021
#
# Diffchecker export (saved diff, never expires).
# Left pane:  624 lines, 22 removals.
# Right pane: 629 lines, 29 additions.
# https://github.com/D-X-Y/AutoDL-Projects/issues/99
# https://github.com/D-X-Y/AutoDL-Projects/issues/99


import torch
import torch
import torch.utils.data
import torch.utils.data
import torch.nn as nn
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.functional as F
import torch.optim as optim
import torch.optim as optim


import torchvision
import torchvision
import torchvision.transforms as transforms
import torchvision.transforms as transforms


# import numpy as np
# import numpy as np
torch.autograd.set_detect_anomaly(True)
torch.autograd.set_detect_anomaly(True)


# True when PyTorch can see a CUDA device; checked before moving tensors/modules with .cuda()
USE_CUDA = torch.cuda.is_available()

# https://arxiv.org/pdf/1806.09055.pdf#page=12
TEST_DATASET_RATIO = 0.5  # 50 percent of the dataset is dedicated for testing purpose
BATCH_SIZE = 4
NUM_OF_IMAGE_CHANNELS = 3  # RGB
IMAGE_HEIGHT = 32
IMAGE_WIDTH = 32
NUM_OF_IMAGE_CLASSES = 10

SIZE_OF_HIDDEN_LAYERS = 64
NUM_EPOCHS = 1
LEARNING_RATE = 0.025
MOMENTUM = 0.9
NUM_OF_CELLS = 8
NUM_OF_MIXED_OPS = 4
NUM_OF_PREVIOUS_CELLS_OUTPUTS = 2  # last_cell_output , second_last_cell_output
NUM_OF_NODES_IN_EACH_CELL = 4
MAX_NUM_OF_CONNECTIONS_PER_NODE = NUM_OF_NODES_IN_EACH_CELL
NUM_OF_CHANNELS = 16
INTERVAL_BETWEEN_REDUCTION_CELLS = 3
PREVIOUS_PREVIOUS = 2  # (n-2)
REDUCTION_STRIDE = 2
NORMAL_STRIDE = 1
TAU_GUMBEL = 0.5
EDGE_WEIGHTS_NETWORK_IN_SIZE = 5
EDGE_WEIGHTS_NETWORK_OUT_SIZE = 2


# https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
# Convert images to tensors, then normalize each of the three channels with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

# CIFAR-10 training split (downloaded into ./data when missing), reshuffled by the loader.
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE,
                                          shuffle=True, num_workers=2)

# The CIFAR-10 split selected by train=False is used as validation data; order is kept fixed.
valset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                      download=True, transform=transform)
valloader = torch.utils.data.DataLoader(valset, batch_size=BATCH_SIZE,
                                        shuffle=False, num_workers=2)

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

# Number of training samples left after reserving TEST_DATASET_RATIO of the training set.
TRAIN_BATCH_SIZE = int(len(trainset) * (1 - TEST_DATASET_RATIO))




# https://discordapp.com/channels/687504710118146232/703298739732873296/853270183649083433
# https://discordapp.com/channels/687504710118146232/703298739732873296/853270183649083433
# for training for edge weights as well as internal NN function weights
# for training for edge weights as well as internal NN function weights
class Edge(nn.Module):
    """Base class for one candidate operation on a connection between two nodes.

    The edge owns a single scalar architecture weight (``self.weights``).
    Subclasses must assign the wrapped operation to ``self.f``; this base
    class never sets it, so ``forward_f`` and the ``f`` freeze helpers only
    work on subclasses that do.
    """

    def __init__(self):
        super(Edge, self).__init__()

        # https://stackoverflow.com/a/51027227/8776167
        # self.linear = nn.Linear(EDGE_WEIGHTS_NETWORK_IN_SIZE, EDGE_WEIGHTS_NETWORK_OUT_SIZE)
        # https://pytorch.org/docs/stable/generated/torch.nn.parameter.Parameter.html
        # for edge weights, not for internal NN function weights
        # Created directly on the target device so the parameter stays a leaf tensor.
        device = "cuda" if USE_CUDA else "cpu"
        self.weights = nn.Parameter(torch.zeros(1, device=device))

    def __freeze_w(self):
        """Exclude the architecture weight from gradient computation."""
        self.weights.requires_grad_(False)

    def __unfreeze_w(self):
        """Include the architecture weight in gradient computation."""
        self.weights.requires_grad_(True)

    def __freeze_f(self):
        """Exclude every parameter of the wrapped operation from gradient computation."""
        for param in self.f.parameters():
            param.requires_grad_(False)

    def __unfreeze_f(self):
        """Include every parameter of the wrapped operation in gradient computation."""
        for param in self.f.parameters():
            param.requires_grad_(True)

    # for NN functions internal weights training
    def forward_f(self, x):
        """Apply the wrapped operation to ``x``.

        As a side effect the operation's parameters are unfrozen and the
        architecture weight is frozen.

        Returns:
            ``self.f(x)``.
        """
        self.__unfreeze_f()
        self.__freeze_w()

        # inheritance in python classes and SOLID principles
        # https://en.wikipedia.org/wiki/SOLID
        # https://blog.cleancoder.com/uncle-bob/2020/10/18/Solid-Relevance.html
        return self.f(x)

    # self-defined initial NAS architecture, for supernet architecture edge weight training
    def forward_edge(self, x):
        """Scale ``x`` by the architecture weight.

        As a side effect the operation's parameters are frozen and the
        architecture weight is unfrozen.

        Returns:
            ``x * self.weights``.
        """
        self.__freeze_f()
        self.__unfreeze_w()

        return x * self.weights




class ConvEdge(Edge):
    """Edge whose operation is a 3x3 convolution from 3 channels to 3 channels.

    Args:
        stride: stride applied along both spatial dimensions.
    """

    def __init__(self, stride):
        super().__init__()
        self.f = nn.Conv2d(in_channels=3, out_channels=3, kernel_size=(3, 3),
                           stride=(stride, stride), padding=1)




class LinearEdge(Edge):
    """Edge whose operation is a fully connected layer with 84 inputs and 10 outputs."""

    def __init__(self):
        super().__init__()
        self.f = nn.Linear(84, 10)




class MaxPoolEdge(Edge):
    """Edge whose operation is a 3x3 max pooling (padding 1, ceil mode).

    Args:
        stride: stride of the pooling window.
    """

    def __init__(self, stride):
        super().__init__()
        self.f = nn.MaxPool2d(kernel_size=3, stride=stride, padding=1, ceil_mode=True)




class AvgPoolEdge(Edge):
    """Edge whose operation is a 3x3 average pooling (padding 1, ceil mode).

    Args:
        stride: stride of the pooling window.
    """

    def __init__(self, stride):
        super().__init__()
        self.f = nn.AvgPool2d(kernel_size=3, stride=stride, padding=1, ceil_mode=True)




class Skip(nn.Module):
    """Identity module: returns its input unchanged."""

    def forward(self, x):
        """Return ``x`` itself (no copy is made)."""
        return x




class SkipEdge(Edge):
    """Edge whose operation is the identity (``Skip``)."""

    def __init__(self):
        super().__init__()
        self.f = Skip()




# to collect and manage different edges between 2 nodes
# to collect and manage different edges between 2 nodes
class Connection(nn.Module):
    """Collects the candidate edges (mixed operations) between two nodes.

    Args:
        stride: stride forwarded to the convolution and pooling edges.

    Attributes set here and read by the training code:
        edges: ``nn.ModuleList`` of the conv, max-pool, avg-pool and skip edges.
        edge_weights: snapshot of each edge's scalar weight at construction time.
        combined_feature_map: accumulator for the summed edge outputs.
        chosen_edge: index sampled once via hard Gumbel-softmax over ``edge_weights``.
    """

    def __init__(self, stride):
        super(Connection, self).__init__()

        # Build each distinct edge exactly once. Module parameters require
        # gradients by default, so no extra requires_grad_() call is needed.
        # self.linear_edge = LinearEdge()
        self.conv2d_edge = ConvEdge(stride)
        self.maxpool_edge = MaxPoolEdge(stride)
        self.avgpool_edge = AvgPoolEdge(stride)
        self.skip_edge = SkipEdge()

        # A plain python list would hide the edges from nn.Module bookkeeping;
        # nn.ModuleList registers them as submodules so that .parameters(),
        # .cuda() and the optimizer can reach their parameters.
        self.edges = nn.ModuleList([self.conv2d_edge, self.maxpool_edge, self.avgpool_edge, self.skip_edge])

        if USE_CUDA:
            # Moves the registered edge modules in place.
            self.edges.cuda()

        self.edge_weights = torch.zeros(NUM_OF_MIXED_OPS, requires_grad=True)

        # for approximate architecture gradient
        self.f_weights = torch.zeros(NUM_OF_MIXED_OPS, requires_grad=True)
        self.f_weights_backup = torch.zeros(NUM_OF_MIXED_OPS, requires_grad=True)
        self.weight_plus = torch.zeros(NUM_OF_MIXED_OPS, requires_grad=True)
        self.weight_minus = torch.zeros(NUM_OF_MIXED_OPS, requires_grad=True)

        # use linear transformation (weighted summation) to combine results from different edges
        self.combined_feature_map = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH])

        if USE_CUDA:
            self.combined_feature_map = self.combined_feature_map.cuda()

        # Enabled only after the device move so the tensor remains a leaf.
        self.combined_feature_map.requires_grad_()

        for e in range(NUM_OF_MIXED_OPS):
            # Copies the current value only; no autograd link to the edge parameter is created.
            with torch.no_grad():
                self.edge_weights[e] = self.edges[e].weights

            # https://stackoverflow.com/a/45024500/8776167 extracts the weights learned through NN functions
            # self.f_weights[e] = list(self.edges[e].parameters())

        # Refer to GDAS equations (5) and (6)
        # if one_hot is already there, would summation be required given that all other entries are forced to 0 ?
        # It's not required, but you don't know, which index is one hot encoded 1.
        # https://pytorch.org/docs/stable/nn.functional.html#gumbel-softmax
        # See also https://github.com/D-X-Y/AutoDL-Projects/issues/10#issuecomment-916619163

        gumbel = F.gumbel_softmax(self.edge_weights, tau=TAU_GUMBEL, hard=True)
        self.chosen_edge = torch.argmax(gumbel, dim=0)  # converts one-hot encoding into integer




# to collect and manage multiple different connections between a particular node and its neighbouring nodes
# to collect and manage multiple different connections between a particular node and its neighbouring nodes
class Node(nn.Module):
    """Collects the connections between one node and its neighbouring nodes.

    Args:
        stride: stride forwarded to every ``Connection``.
    """

    def __init__(self, stride):
        super(Node, self).__init__()

        # two types of output connections
        # Type 1: (multiple edges) output connects to the input of the other intermediate nodes
        # Type 2: (single edge) output connects directly to the final output node

        # Type 1
        self.connections = nn.ModuleList([Connection(stride) for _ in range(MAX_NUM_OF_CONNECTIONS_PER_NODE)])

        # Type 2
        # depends on PREVIOUS node's Type 1 output
        self.output = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH])  # for initialization

        if USE_CUDA:
            self.output = self.output.cuda()

        # Enabled only after the device move so the tensor remains a leaf.
        self.output = self.output.requires_grad_()



# to manage all nodes within a cell
# to manage all nodes within a cell
class Cell(nn.Module):
    """Manages all nodes within one cell.

    Args:
        stride: stride forwarded to every ``Node``.
    """

    def __init__(self, stride):
        super(Cell, self).__init__()

        # all the coloured edges inside
        # https://user-images.githubusercontent.com/3324659/117573177-20ea9a80-b109-11eb-9418-16e22e684164.png
        # A single cell contains 'NUM_OF_NODES_IN_EACH_CELL' distinct nodes
        # for the k-th node, we have (k+1) preceding nodes.
        # Each intermediate state, 0->3 ('NUM_OF_NODES_IN_EACH_CELL-1'),
        # is connected to each previous intermediate state
        # as well as the output of the previous two cells, c_{k-2} and c_{k-1} (after a preprocessing layer).
        # previous_previous_cell_output = c_{k-2}
        # previous_cell_output = c{k-1}
        self.nodes = nn.ModuleList([Node(stride) for _ in range(NUM_OF_NODES_IN_EACH_CELL)])

        # just for variables initialization
        self.previous_cell = 0
        self.previous_previous_cell = 0
        self.output = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH])

        if USE_CUDA:
            self.output = self.output.cuda()

        # Enabled only after the device move so the tensor remains a leaf.
        self.output = self.output.requires_grad_()

        for n in range(NUM_OF_NODES_IN_EACH_CELL):
            # 'add' then 'concat' feature maps from different nodes
            # needs to take care of tensor dimension mismatch
            # See https://github.com/D-X-Y/AutoDL-Projects/issues/99#issuecomment-869100416
            # Out-of-place addition: self.output becomes a non-leaf tensor from here on.
            self.output = self.output + self.nodes[n].output




# to manage all nodes
# to manage all nodes
class Graph(nn.Module):
    """Manages all cells of the supernet (``self.cells``)."""

    def __init__(self):
        super(Graph, self).__init__()

        stride = 0  # just to initialize a variable

        for i in range(NUM_OF_CELLS):
            if i % INTERVAL_BETWEEN_REDUCTION_CELLS == 0:
                stride = REDUCTION_STRIDE  # to emulate reduction cell by using normal cell with stride=2
            else:
                stride = NORMAL_STRIDE  # normal cell

        # NOTE(review): every cell is built with the stride left over from the
        # LAST loop index, so all cells share one stride and no cell is built
        # with a per-index stride. This is kept as-is on purpose: the training
        # code adds feature maps of a fixed [BATCH_SIZE, C, H, W] shape, so a
        # per-cell stride needs shape handling there first. Confirm the intent
        # before changing this to Cell(<stride for index i>).
        self.cells = nn.ModuleList([Cell(stride) for _ in range(NUM_OF_CELLS)])




# Gradients captured by hook_fn_backward, in the order the hooks fire.
total_grad_out = []
total_grad_in = []


def hook_fn_backward(module, grad_input, grad_output):
    """Backward hook that prints and records a module's gradients.

    Args:
        module: the module the hook fired for.
        grad_input: gradients with respect to the module inputs.
        grad_output: gradients with respect to the module outputs.

    Returns:
        None, so the gradients are left unmodified.
    """
    print(module)  # for distinguishing module

    # In order to comply with the order back-propagation, let's print grad_output
    print('grad_output', grad_output)

    # Reprint grad_input
    print('grad_input', grad_input)

    # Save to global variables
    total_grad_in.append(grad_input)
    total_grad_out.append(grad_output)




# https://translate.google.com/translate?sl=auto&tl=en&u=http://khanrc.github.io/nas-4-darts-tutorial.html
# https://translate.google.com/translate?sl=auto&tl=en&u=http://khanrc.github.io/nas-4-darts-tutorial.html
def train_NN(forward_pass_only):
    """Run the supernet over the paired loaders and optionally update its weights.

    Args:
        forward_pass_only: ``0`` runs backward passes and optimizer steps for
            every batch; any other value returns the loss of the first batch
            without updating anything.

    Returns:
        The most recently computed cross-entropy loss (the initial ``0`` if
        the loaders yield nothing).

    Side effects:
        Prints debugging information, registers backward hooks on every
        module of the graph and, in training mode, pickles the whole graph
        to ``./model.pth`` after the loops.

    NOTE(review): the loop nesting below was reconstructed from a listing
    that had lost its indentation -- confirm it against the original file.
    """
    print("Entering train_NN(), forward_pass_only = ", forward_pass_only)

    graph = Graph()

    if USE_CUDA:
        graph = graph.cuda()

    # named_children() is a generator; materialize it so the print is informative.
    modules = list(graph.named_children())
    print("modules = " , modules)

    for module in graph.modules():
        module.register_full_backward_hook(hook_fn_backward)

    # Final classifier. Built once and handed to the optimizer; re-creating it
    # on every step would classify with fresh random weights that never train.
    m_linear = nn.Linear(NUM_OF_IMAGE_CHANNELS * IMAGE_HEIGHT * IMAGE_WIDTH, NUM_OF_IMAGE_CLASSES)

    if USE_CUDA:
        m_linear = m_linear.cuda()

    criterion = nn.CrossEntropyLoss()
    # criterion = nn.BCELoss()
    optimizer1 = optim.SGD(list(graph.parameters()) + list(m_linear.parameters()),
                           lr=LEARNING_RATE, momentum=MOMENTUM)

    # just for initialization, no special meaning
    Ltrain = 0

    if forward_pass_only == 0:
        # do train thing for architecture edge weights
        graph.train()

    print("before multiple for-loops")

    # val_data is unused here; zip() only keeps the two loaders in lockstep.
    for train_data, val_data in zip(trainloader, valloader):

        train_inputs, train_labels = train_data
        # val_inputs, val_labels = val_data

        if USE_CUDA:
            train_inputs = train_inputs.cuda()
            train_labels = train_labels.cuda()

        for epoch in range(NUM_EPOCHS):
            # forward pass
            # NOTE(review): combined_feature_map / output are stored on the
            # modules and never reset between batches, so their values (and
            # autograd history) carry over -- confirm this is intended.
            for c in range(NUM_OF_CELLS):
                cell = graph.cells[c]

                for n in range(NUM_OF_NODES_IN_EACH_CELL):
                    node = cell.nodes[n]

                    # not all nodes have same number of Type-1 output connection
                    for cc in range(MAX_NUM_OF_CONNECTIONS_PER_NODE - n - 1):
                        conn = node.connections[cc]

                        for e in range(NUM_OF_MIXED_OPS):
                            if c == 0:
                                # already moved to the GPU above when USE_CUDA is set
                                x = train_inputs

                            elif n == 0:
                                # Uses feature map output from previous neighbour cell for further processing
                                x = graph.cells[c-1].nodes[NUM_OF_NODES_IN_EACH_CELL-1].connections[cc].combined_feature_map

                            else:
                                # Uses feature map output from previous neighbour node for further processing
                                x = cell.nodes[n-1].connections[cc].combined_feature_map

                            # combines all the feature maps from different mixed ops edges
                            # (out-of-place add keeps autograd from seeing an in-place write)
                            conn.combined_feature_map = \
                                conn.combined_feature_map + \
                                conn.edges[e].forward_f(x)  # Ltrain(w±, alpha)

                            print("graph.cells[", c, "].nodes[", n, "].connections[", cc, "].combined_feature_map.grad_fn = ",
                                  conn.combined_feature_map.grad_fn)

                            print("graph.cells[", c, "].nodes[", n, "].connections[", cc, "].edge_weights[", e, "].grad_fn = ",
                                  conn.edge_weights[e].grad_fn)

                            print("graph.cells[", c, "].output.grad_fn = ",
                                  cell.output.grad_fn)

                            # https://www.reddit.com/r/learnpython/comments/no7btk/how_to_carry_extra_information_across_dag/
                            # https://docs.python.org/3/tutorial/datastructures.html

                            # generates a supernet consisting of 'NUM_OF_CELLS' cells
                            # each cell contains of 'NUM_OF_NODES_IN_EACH_CELL' nodes
                            # refer to PNASNet https://arxiv.org/pdf/1712.00559.pdf#page=5 for the cell arrangement
                            # https://pytorch.org/tutorials/beginner/examples_autograd/two_layer_net_custom_function.html

                            # encodes the cells and nodes arrangement in the multigraph

                            if c > 1:  # for previous_previous_cell, (c-2)
                                cell.previous_cell = graph.cells[c - 1].output
                                cell.previous_previous_cell = graph.cells[c - PREVIOUS_PREVIOUS].output

                            if n == 0:
                                if c <= 1:
                                    # there is no input from previous cells for the first two cells
                                    node.output = conn.combined_feature_map

                                else:
                                    # needs to take care tensor dimension mismatch from multiple edges connections
                                    node.output = \
                                        node.output + \
                                        graph.cells[c-1].output + graph.cells[c-PREVIOUS_PREVIOUS].output

                            else:  # n > 0
                                # depends on PREVIOUS node's Type 1 connection
                                # needs to take care tensor dimension mismatch from multiple edges connections
                                print("graph.cells[", c ,"].nodes[" ,n, "].output.size() = ",
                                      node.output.size())

                                print("graph.cells[", c, "].nodes[", n-1, "].connections[", cc, "].combined_feature_map.size() = ",
                                      cell.nodes[n-1].connections[cc].combined_feature_map.size())

                                # NOTE(review): for c < 2 the indices c-1 / c-2 are
                                # negative and wrap around to the last cells -- confirm.
                                node.output = \
                                    node.output + \
                                    cell.nodes[n-1].connections[cc].combined_feature_map + \
                                    graph.cells[c - 1].output + \
                                    graph.cells[c - PREVIOUS_PREVIOUS].output

                            print("graph.cells[", c, "].nodes[", n, "].output.grad_fn = ",
                                  node.output.grad_fn)

                            # 'add' then 'concat' feature maps from different nodes
                            # needs to take care of tensor dimension mismatch
                            # See https://github.com/D-X-Y/AutoDL-Projects/issues/99#issuecomment-869100416
                            cell.output += node.output

                            print("graph.cells[", c, "].output.grad_fn = ",
                                  cell.output.grad_fn)

            # Flatten the last cell's feature map to (batch, features) for the classifier.
            output_tensor = graph.cells[NUM_OF_CELLS-1].output
            output_tensor = output_tensor.view(output_tensor.shape[0], -1)

            outputs1 = m_linear(output_tensor)

            print("outputs1.size() = ", outputs1.size())
            print("train_labels.size() = ", train_labels.size())

            Ltrain = criterion(outputs1, train_labels)

            if forward_pass_only == 0:
                # zero the parameter gradients before every backward pass so
                # gradients from earlier batches do not accumulate
                optimizer1.zero_grad()

                # backward pass
                Ltrain = Ltrain.requires_grad_()

                Ltrain.retain_grad()
                Ltrain.register_hook(lambda grad: print(grad))

                Ltrain.backward()

                print("starts to print graph.named_parameters()")

                for name, param in graph.named_parameters():
                    print(name, param.grad)

                print("finished printing graph.named_parameters()")

                optimizer1.step()

            else:
                # no need to save model parameters for next epoch
                return Ltrain

    # DARTS's approximate architecture gradient. Refer to equation (8)
    # needs to save intermediate trained model for Ltrain
    path = './model.pth'
    torch.save(graph, path)

    print("after multiple for-loops")

    return Ltrain




def train_architecture(forward_pass_only, train_or_val='val'):
def train_architecture(forward_pass_only, train_or_val='val'):
print("Entering train_architecture(), forward_pass_only = ", forward_pass_only, " , train_or_val = ", train_or_val)
print("Entering train_architecture(), forward_pass_only = ", forward_pass_only, " , train_or_val = ", train_or_val)


graph = Graph()
graph = Graph()


if USE_CUDA:
if USE_CUDA:
graph = graph.cuda()
graph = graph.cuda()


criterion = nn.CrossEntropyLoss()
criterion = nn.CrossEntropyLoss()
optimizer2 = optim.SGD(graph.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)
optimizer2 = optim.SGD(graph.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)


# just for initialization, no special meaning
# just for initialization, no special meaning
Lval = 0
Lval = 0
train_inputs = 0
train_inputs = 0
train_labels = 0
train_labels = 0
val_inputs = 0
val_inputs = 0
val_labels = 0
val_labels = 0


if forward_pass_only == 0:
if forward_pass_only == 0:
# do train thing for internal NN function weights
# do train thing for internal NN function weights
graph.train()
graph.train()


# zero the parameter gradients
# zero the parameter gradients
optimizer2.zero_grad()
optimizer2.zero_grad()


print("before multiple for-loops")
print("before multiple for-loops")


for train_data, val_data in (zip(trainloader, valloader)):
for train_data, val_data in (zip(trainloader, valloader)):


train_inputs, train_labels = train_data
train_inputs, train_labels = train_data
val_inputs, val_labels = val_data
val_inputs, val_labels = val_data


if USE_CUDA:
if USE_CUDA:
train_inputs = train_inputs.cuda()
train_inputs = train_inputs.cuda()
train_labels = train_labels.cuda()
train_labels = train_labels.cuda()
val_inputs = val_inputs.cuda()
val_inputs = val_inputs.cuda()
val_labels = val_labels.cuda()
val_labels = val_labels.cuda()


for epoch in range(NUM_EPOCHS):
for epoch in range(NUM_EPOCHS):


# forward pass
# forward pass
# use linear transformation ('weighted sum then concat') to combine results from different nodes
# use linear transformation ('weighted sum then concat') to combine results from different nodes
# into an output feature map to be fed into the next neighbour node for further processing
# into an output feature map to be fed into the next neighbour node for further processing
for c in range(NUM_OF_CELLS):
for c in range(NUM_OF_CELLS):
for n in range(NUM_OF_NODES_IN_EACH_CELL):
for n in range(NUM_OF_NODES_IN_EACH_CELL):
# not all nodes have same number of Type-1 output connection
# not all nodes have same number of Type-1 output connection
for cc in range(MAX_NUM_OF_CONNECTIONS_PER_NODE - n - 1):
for cc in range(MAX_NUM_OF_CONNECTIONS_PER_NODE - n - 1):
for e in range(NUM_OF_MIXED_OPS):
for e in range(NUM_OF_MIXED_OPS):
x = 0 # depends on the input tensor dimension requirement
x = 0 # depends on the input tensor dimension requirement


if c == 0:
if c == 0:
if train_or_val == 'val':
if train_or_val == 'val':
x = val_inputs
x = val_inputs


else:
else:
x = train_inputs
x = train_inputs


else:
else:
# Uses feature map output from previous neighbour node for further processing
# Uses feature map output from previous neighbour node for further processing
x = graph.cells[c].nodes[n-1].connections[cc].combined_feature_map
x = graph.cells[c].nodes[n-1].connections[cc].combined_feature_map


# need to take care of tensors dimension mismatch
# need to take care of tensors dimension mismatch
graph.cells[c].nodes[n].connections[cc].combined_feature_map += \
graph.cells[c].nodes[n].connections[cc].combined_feature_map = \
graph.cells[c].nodes[n].connections[cc].combined_feature_map + \
graph.cells[c].nodes[n].connections[cc].edges[e].forward_edge(x) # Lval(w*, alpha)
graph.cells[c].nodes[n].connections[cc].edges[e].forward_edge(x) # Lval(w*, alpha)


output2_tensor = graph.cells[NUM_OF_CELLS-1].output
output2_tensor = graph.cells[NUM_OF_CELLS-1].output
output2_tensor = output2_tensor.view(output2_tensor.shape[0], -1)
output2_tensor = output2_tensor.view(output2_tensor.shape[0], -1)


if USE_CUDA:
if USE_CUDA:
output2_tensor = output2_tensor.cuda()
output2_tensor = output2_tensor.cuda()


if USE_CUDA:
if USE_CUDA:
m_linear = nn.Linear(NUM_OF_IMAGE_CHANNELS * IMAGE_HEIGHT * IMAGE_WIDTH, NUM_OF_IMAGE_CLASSES).cuda()
m_linear = nn.Linear(NUM_OF_IMAGE_CHANNELS * IMAGE_HEIGHT * IMAGE_WIDTH, NUM_OF_IMAGE_CLASSES).cuda()


else:
else:
m_linear = nn.Linear(NUM_OF_IMAGE_CHANNELS * IMAGE_HEIGHT * IMAGE_WIDTH, NUM_OF_IMAGE_CLASSES)
m_linear = nn.Linear(NUM_OF_IMAGE_CHANNELS * IMAGE_HEIGHT * IMAGE_WIDTH, NUM_OF_IMAGE_CLASSES)


outputs2 = m_linear(output2_tensor)
outputs2 = m_linear(output2_tensor)


if USE_CUDA:
if USE_CUDA:
outputs2 = outputs2.cuda()
outputs2 = outputs2.cuda()


print("outputs2.size() = ", outputs2.size())
print("outputs2.size() = ", outputs2.size())
print("val_labels.size() = ", val_labels.size())
print("val_labels.size() = ", val_labels.size())
print("train_labels.size() = ", train_labels.size())
print("train_labels.size() = ", train_labels.size())


if train_or_val == 'val':
if train_or_val == 'val':
loss = criterion(outputs2, val_labels)
loss = criterion(outputs2, val_labels)


else:
else:
loss = criterion(outputs2, train_labels)
loss = criterion(outputs2, train_labels)


if forward_pass_only == 0:
if forward_pass_only == 0:
# backward pass
# backward pass
Lval = loss
Lval = loss
Lval = Lval.requires_grad_()
Lval = Lval.requires_grad_()
Lval.backward()
Lval.backward()


for name, param in graph.named_parameters():
for name, param in graph.named_parameters():
print(name, param.grad)
print(name, param.grad)


optimizer2.step()
optimizer2.step()


else:
else:
# no need to save model parameters for next epoch
# no need to save model parameters for next epoch
return loss
return loss


# needs to save intermediate trained model for Lval
# needs to save intermediate trained model for Lval
path = './model.pth'
path = './model.pth'
torch.save(graph, path)
torch.save(graph, path)


# DARTS's approximate architecture gradient. Refer to equation (8) and https://i.imgur.com/81JFaWc.png
# DARTS's approximate architecture gradient. Refer to equation (8) and https://i.imgur.com/81JFaWc.png
sigma = LEARNING_RATE
sigma = LEARNING_RATE
epsilon = 0.01 / torch.norm(Lval)
epsilon = 0.01 / torch.norm(Lval)


for c in range(NUM_OF_CELLS):
for c in range(NUM_OF_CELLS):
for n in range(NUM_OF_NODES_IN_EACH_CELL):
for n in range(NUM_OF_NODES_IN_EACH_CELL):
# not all nodes have same number of Type-1 output connection
# not all nodes have same number of Type-1 output connection
for cc in range(MAX_NUM_OF_CONNECTIONS_PER_NODE - n - 1):
for cc in range(MAX_NUM_OF_CONNECTIONS_PER_NODE - n - 1):
CC = graph.cells[c].nodes[n].connections[cc]
CC = graph.cells[c].nodes[n].connections[cc]


for e in range(NUM_OF_MIXED_OPS):
for e in range(NUM_OF_MIXED_OPS):
for w in graph.cells[c].nodes[n].connections[cc].edges[e].f.parameters():
for w in graph.cells[c].nodes[n].connections[cc].edges[e].f.parameters():
# https://mythrex.github.io/math_behind_darts/
# https://mythrex.github.io/math_behind_darts/
# Finite Difference Method
# Finite Difference Method
CC.weight_plus = w + epsilon * Lval
CC.weight_plus = w + epsilon * Lval
CC.weight_minus = w - epsilon * Lval
CC.weight_minus = w - epsilon * Lval


# Backups original f_weights
# Backups original f_weights
CC.f_weights_backup = w
CC.f_weights_backup = w


# replaces f_weights with weight_plus before NN training
# replaces f_weights with weight_plus before NN training
for c in range(NUM_OF_CELLS):
for c in range(NUM_OF_CELLS):
for n in range(NUM_OF_NODES_IN_EACH_CELL):
for n in range(NUM_OF_NODES_IN_EACH_CELL):
# not all nodes have same number of Type-1 output connection
# not all nodes have same number of Type-1 output connection
for cc in range(MAX_NUM_OF_CONNECTIONS_PER_NODE - n - 1):
for cc in range(MAX_NUM_OF_CONNECTIONS_PER_NODE - n - 1):
CC = graph.cells[c].nodes[n].connections[cc]
CC = graph.cells[c].nodes[n].connections[cc]


for e in range(NUM_OF_MIXED_OPS):
for e in range(NUM_OF_MIXED_OPS):
for w in graph.cells[c].nodes[n].connections[cc].edges[e].f.parameters():
w = CC.weight_plus

# test NN to obtain loss
Ltrain_plus = train_architecture(forward_pass_only=1, train_or_val='train')


# replaces f_weights with weight_minus before NN training
for c in range(NUM_OF_CELLS):
for n in range(NUM_OF_NODES_IN_EACH_CELL):
# not all nodes have same number of Type-1 output connection
for cc