# gdas diff for train_architecture()
# https://github.com/D-X-Y/AutoDL-Projects/issues/99

import torch
import torch.utils.data
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchvision
import torchvision.transforms as transforms

import tensorflow as tf

# import numpy as np


VISUALIZER = 0
DEBUG = 0
logdir = 'runs/gdas_experiment_1'

if VISUALIZER:
    # https://pytorch.org/tutorials/intermediate/tensorboard_tutorial.html
    from torch.utils.tensorboard import SummaryWriter

    # from tensorboardX import SummaryWriter

    # default `log_dir` is "runs" - we'll be more specific here
    writer = SummaryWriter(logdir)

    # https://github.com/szagoruyko/pytorchviz
    from torchviz import make_dot

if DEBUG:
    torch.autograd.set_detect_anomaly(True)
    tf.debugging.experimental.enable_dump_debug_info(logdir, tensor_debug_mode="FULL_HEALTH", circular_buffer_size=-1)


USE_CUDA = torch.cuda.is_available()

# https://arxiv.org/pdf/1806.09055.pdf#page=12
TEST_DATASET_RATIO = 0.5  # 50 percent of the dataset is dedicated to testing purposes
BATCH_SIZE = 16
NUM_OF_IMAGE_CHANNELS = 3  # RGB
IMAGE_HEIGHT = 32
IMAGE_WIDTH = 32
NUM_OF_IMAGE_CLASSES = 10

SIZE_OF_HIDDEN_LAYERS = 64
NUM_EPOCHS = 1
LEARNING_RATE = 0.025
MOMENTUM = 0.9
DECAY_FACTOR = 0.001  # for keeping Ltrain within an acceptable range
NUM_OF_CELLS = 8
NUM_OF_MIXED_OPS = 4
MIXED_OPS_TENSOR_SHAPE = 4  # shape of the computational kernel used inside each mixed op
NUM_OF_PREVIOUS_CELLS_OUTPUTS = 2  # last_cell_output, second_last_cell_output
NUM_OF_NODES_IN_EACH_CELL = 5  # including the last node that combines the outputs from all 4 previous nodes
MAX_NUM_OF_CONNECTIONS_PER_NODE = NUM_OF_NODES_IN_EACH_CELL
NUM_OF_CHANNELS = 16
INTERVAL_BETWEEN_REDUCTION_CELLS = 3
PREVIOUS_PREVIOUS = 2  # (n-2)
REDUCTION_STRIDE = 2
NORMAL_STRIDE = 1
TAU_GUMBEL = 0.5
EDGE_WEIGHTS_NETWORK_IN_SIZE = 5
EDGE_WEIGHTS_NETWORK_OUT_SIZE = 2


# https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE,
                                          shuffle=True, num_workers=2)

valset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                      download=True, transform=transform)
valloader = torch.utils.data.DataLoader(valset, batch_size=BATCH_SIZE,
                                        shuffle=False, num_workers=2)

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

TRAIN_BATCH_SIZE = int(len(trainset) * (1 - TEST_DATASET_RATIO))
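# Worked example: torchvision's CIFAR-10 train split has 50,000 images,
# so TRAIN_BATCH_SIZE evaluates to int(50000 * (1 - 0.5)) = 25000 here.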




# https://discordapp.com/channels/687504710118146232/703298739732873296/853270183649083433
# for training the edge weights as well as the internal NN function weights
class Edge(nn.Module):

    def __init__(self):
        super(Edge, self).__init__()

        # https://stackoverflow.com/a/51027227/8776167
        # self.linear = nn.Linear(EDGE_WEIGHTS_NETWORK_IN_SIZE, EDGE_WEIGHTS_NETWORK_OUT_SIZE)
        # https://pytorch.org/docs/stable/generated/torch.nn.parameter.Parameter.html
        self.weights = nn.Parameter(torch.zeros(1),
                                    requires_grad=True)  # for edge weights, not for internal NN function weights

        # for the approximate architecture gradient
        self.f_weights = torch.zeros(MIXED_OPS_TENSOR_SHAPE, requires_grad=True)
        self.f_weights_backup = torch.zeros(MIXED_OPS_TENSOR_SHAPE, requires_grad=True)
        self.weight_plus = torch.zeros(MIXED_OPS_TENSOR_SHAPE, requires_grad=True)
        self.weight_minus = torch.zeros(MIXED_OPS_TENSOR_SHAPE, requires_grad=True)


    def __freeze_w(self):
        self.weights.requires_grad = False

    def __unfreeze_w(self):
        self.weights.requires_grad = True

    def __freeze_f(self):
        for param in self.f.parameters():
            param.requires_grad = False

    def __unfreeze_f(self):
        for param in self.f.parameters():
            param.requires_grad = True


    # for training the internal weights of the NN functions
    def forward_f(self, x):
        self.__unfreeze_f()
        self.__freeze_w()

        # inheritance in python classes and SOLID principles
        # https://en.wikipedia.org/wiki/SOLID
        # https://blog.cleancoder.com/uncle-bob/2020/10/18/Solid-Relevance.html
        return self.f(x)


    # self-defined initial NAS architecture, for supernet architecture edge weight training
    def forward_edge(self, x):
        self.__freeze_f()
        self.__unfreeze_w()

        # Refer to GDAS equations (5) and (6)
        # If the output is already one-hot, would summation be required given that all other entries are forced to 0?
        # Summation is not strictly required, but we do not know in advance which index holds the one-hot 1.
        # https://pytorch.org/docs/stable/nn.functional.html#gumbel-softmax
        # See also https://github.com/D-X-Y/AutoDL-Projects/issues/10#issuecomment-916619163
        gumbel = F.gumbel_softmax(x, tau=TAU_GUMBEL, hard=True)
        chosen_edge = torch.argmax(gumbel, dim=0)  # converts the one-hot encoding into an integer index

        return chosen_edge
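    # Illustrative sketch (not executed anywhere in this file): with hard=True,
    # F.gumbel_softmax returns a straight-through one-hot sample, e.g.
    #   F.gumbel_softmax(torch.tensor([1.0, 2.0, 3.0, 4.0]), tau=0.5, hard=True)
    #   -> tensor([0., 0., 0., 1.])  # the hot index varies with the sampled Gumbel noise
    # torch.argmax then recovers the sampled index as an integer tensor.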


    def forward(self, x, types):
        y_hat = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH], requires_grad=False)
        if USE_CUDA:
            y_hat = y_hat.cuda()

        if types == "f":
            y_hat = self.forward_f(x)

        elif types == "edge":
            y_hat = self.forward_edge(x)

        return y_hat


class ConvEdge(Edge):
    def __init__(self, stride):
        super().__init__()
        self.f = nn.Conv2d(in_channels=3, out_channels=3, kernel_size=(3, 3), stride=(stride, stride), padding=1)

        # Kaiming He weight initialization
        # https://medium.com/@shoray.goel/kaiming-he-initialization-a8d9ed0b5899
        nn.init.kaiming_uniform_(self.f.weight, mode='fan_in', nonlinearity='relu')


# class LinearEdge(Edge):
#     def __init__(self):
#         super().__init__()
#         self.f = nn.Linear(84, 10)


class MaxPoolEdge(Edge):
    def __init__(self, stride):
        super().__init__()
        self.f = nn.MaxPool2d(kernel_size=3, stride=stride, padding=1, ceil_mode=True)


class AvgPoolEdge(Edge):
    def __init__(self, stride):
        super().__init__()
        self.f = nn.AvgPool2d(kernel_size=3, stride=stride, padding=1, ceil_mode=True)


class Skip(nn.Module):
    def forward(self, x):
        return x


class SkipEdge(Edge):
    def __init__(self):
        super().__init__()
        self.f = Skip()


# to collect and manage the different edges between 2 nodes
class Connection(nn.Module):
    def __init__(self, stride):
        super(Connection, self).__init__()

        if USE_CUDA:
            # creates distinct edges and references each of them in a list (self.edges)
            # self.linear_edge = LinearEdge().cuda()
            self.conv2d_edge = ConvEdge(stride).cuda()
            self.maxpool_edge = MaxPoolEdge(stride).cuda()
            self.avgpool_edge = AvgPoolEdge(stride).cuda()
            self.skip_edge = SkipEdge().cuda()

        else:
            # creates distinct edges and references each of them in a list (self.edges)
            # self.linear_edge = LinearEdge()
            self.conv2d_edge = ConvEdge(stride)
            self.maxpool_edge = MaxPoolEdge(stride)
            self.avgpool_edge = AvgPoolEdge(stride)
            self.skip_edge = SkipEdge()

        # self.edges = [self.conv2d_edge, self.maxpool_edge, self.avgpool_edge, self.skip_edge]
        # a plain python list will break the computation graph, need to use nn.ModuleList as a differentiable python list
        self.edges = nn.ModuleList([self.conv2d_edge, self.maxpool_edge, self.avgpool_edge, self.skip_edge])
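        # Side note (general PyTorch behaviour, not specific to this repo): modules held in a
        # plain Python list are not registered as children, so e.g. .parameters() and .cuda()
        # would silently skip them; nn.ModuleList registers each edge properly.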
        self.edge_weights = torch.zeros(NUM_OF_MIXED_OPS, requires_grad=True)
        # self.edges_results = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
        #                                  requires_grad=False)

        # use a linear transformation (weighted summation) to combine the results from the different edges
        self.combined_feature_map = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
                                                requires_grad=False)
        self.combined_edge_map = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
                                             requires_grad=False)

        if USE_CUDA:
            self.combined_feature_map = self.combined_feature_map.cuda()
            self.combined_edge_map = self.combined_edge_map.cuda()

        for e in range(NUM_OF_MIXED_OPS):
            with torch.no_grad():
                self.edge_weights[e] = self.edges[e].weights

        # https://stackoverflow.com/a/45024500/8776167 extracts the weights learned through the NN functions
        # self.f_weights[e] = list(self.edges[e].parameters())


    def reinit(self):
        self.combined_feature_map = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
                                                requires_grad=False)
        self.combined_edge_map = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
                                             requires_grad=False)

        if USE_CUDA:
            self.combined_feature_map = self.combined_feature_map.cuda()
            self.combined_edge_map = self.combined_edge_map.cuda()


    # See https://www.reddit.com/r/pytorch/comments/rtlvtk/tensorboard_issue_with_selfdefined_forward/
    # Tensorboard visualization requires a generic forward() function
    def forward(self, x, types=None):
        edges_results = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
                                    requires_grad=False)
        if USE_CUDA:
            edges_results = edges_results.cuda()

        for e in range(NUM_OF_MIXED_OPS):
            with torch.no_grad():
                edges_results = edges_results + self.edges[e].forward(x, types)

        return edges_results * DECAY_FACTOR
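# Usage sketch (assumed shapes, not part of the training loop): a Connection sums the
# outputs of its NUM_OF_MIXED_OPS edges and damps the result by DECAY_FACTOR, e.g.
#   conn = Connection(stride=1)
#   out = conn(torch.randn(BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH), types="f")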




# to collect and manage the multiple connections between a particular node and its neighbouring nodes
class Node(nn.Module):
    def __init__(self, stride):
        super(Node, self).__init__()

        # two types of output connections
        # Type 1: (multiple edges) output connects to the input of the other intermediate nodes
        # Type 2: (single edge) output connects directly to the final output node

        # Type 1
        self.connections = nn.ModuleList([Connection(stride) for i in range(MAX_NUM_OF_CONNECTIONS_PER_NODE)])


        # Type 2
        # depends on the PREVIOUS node's Type 1 output
        self.output = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
                                  requires_grad=False)  # for initialization

        if USE_CUDA:
            self.output = self.output.cuda()

    def reinit(self):
        self.output = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
                                  requires_grad=False)
        if USE_CUDA:
            self.output = self.output.cuda()


    # See https://www.reddit.com/r/pytorch/comments/rtlvtk/tensorboard_issue_with_selfdefined_forward/
    # Tensorboard visualization requires a generic forward() function
    def forward(self, x, node_num=0, types=None):
        value = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
                            requires_grad=False)

        # not all nodes have the same number of Type-1 output connections
        for cc in range(MAX_NUM_OF_CONNECTIONS_PER_NODE - node_num - 1):
            y = self.connections[cc].forward(x, types)

            # tensorflow does not like the use of self.variable inside def forward() unlike in Pytorch.
            # Tensorflow prefers the use of a new intermediate variable instead of self.variable
            if types == "f":
                value = self.connections[cc].combined_feature_map

            else:  # "edge"
                value = self.connections[cc].combined_edge_map

            # combines all the feature maps from the different mixed-op edges
            value = value + y  # Ltrain(w±, alpha)

            # stores the addition result for the next for-loop iteration
            if types == "f":
                self.connections[cc].combined_feature_map = value

            else:  # "edge"
                self.connections[cc].combined_edge_map = value

        decayed_value = value * DECAY_FACTOR

        if USE_CUDA:
            decayed_value = decayed_value.cuda()

        return decayed_value
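# Note (observation from the code above): DECAY_FACTOR is applied once inside
# Connection.forward() and again here in Node.forward(), so each edge's contribution
# is effectively scaled by DECAY_FACTOR**2 before reaching the cell output.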




# to manage all the nodes within a cell
class Cell(nn.Module):
    def __init__(self, stride):
        super(Cell, self).__init__()

        # all the coloured edges inside
        # https://user-images.githubusercontent.com/3324659/117573177-20ea9a80-b109-11eb-9418-16e22e684164.png
        # A single cell contains 'NUM_OF_NODES_IN_EACH_CELL' distinct nodes
        # for the k-th node, we have (k+1) preceding nodes.
        # Each intermediate state, 0->3 ('NUM_OF_NODES_IN_EACH_CELL-1'),
        # is connected to each previous intermediate state
        # as well as the output of the previous two cells, c_{k-2} and c_{k-1} (after a preprocessing layer).
        # previous_previous_cell_output = c_{k-2}
        # previous_cell_output = c_{k-1}
        self.nodes = nn.ModuleList([Node(stride) for i in range(NUM_OF_NODES_IN_EACH_CELL)])

        # just for variable initialization
        self.previous_cell = 0
        self.previous_previous_cell = 0
        self.output = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
                                  requires_grad=False)

        if USE_CUDA:
            self.output = self.output.cuda()

    def reinit(self):
        self.output = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
                                  requires_grad=False)
        if USE_CUDA:
            self.output = self.output.cuda()


    # See https://www.reddit.com/r/pytorch/comments/rtlvtk/tensorboard_issue_with_selfdefined_forward/
    # Tensorboard visualization requires a generic forward() function
    def forward(self, x, x1, x2, c=0, types=None):

        value = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
                            requires_grad=False)

        for n in range(NUM_OF_NODES_IN_EACH_CELL):
            if c <= 1:
                if n == 0:
                    # uses the dataset as input
                    # x = train_inputs

                    if USE_CUDA:
                        x = x.cuda()

                    # combines all the feature maps from the different mixed-op edges
                    self.nodes[n].output = \
                        self.nodes[n].forward(x, node_num=n, types=types)  # Ltrain(w±, alpha)

                else:
                    # uses the feature map output from the previous neighbour nodes for further processing
                    for ni in range(n):
                        # nodes[ni] for previous nodes only
                        # connections[n-ni-1] for neighbour nodes only
                        if types == "f":
                            x = self.nodes[ni].connections[n-ni-1].combined_feature_map

                        else:  # "edge"
                            x = self.nodes[ni].connections[n-ni-1].combined_edge_map

                        # combines all the feature maps from the different mixed-op edges
                        self.nodes[n].output = self.nodes[n].output + \
                            self.nodes[n].forward(x, node_num=n, types=types)  # Ltrain(w±, alpha)

            else:
                if n == 0:
                    # uses the feature map output from the previous neighbour cells for further processing
                    self.nodes[n].output = \
                        self.nodes[n].forward(x1, node_num=n, types=types) + \
                        self.nodes[n].forward(x2, node_num=n, types=types)  # Ltrain(w±, alpha)

                else:
                    # uses the feature map output from the previous neighbour nodes for further processing
                    for ni in range(n):
                        # nodes[ni] for previous nodes only
                        # connections[n-ni-1] for neighbour nodes only
                        if types == "f":
                            x = self.nodes[ni].connections[n-ni-1].combined_feature_map

                        else:  # "edge"
                            x = self.nodes[ni].connections[n-ni-1].combined_edge_map

                        # combines all the feature maps from the different mixed-op edges
                        self.nodes[n].output = self.nodes[n].output + \
                            self.nodes[n].forward(x, node_num=n, types=types)  # Ltrain(w±, alpha)

                    # uses the feature map output from the previous neighbour cells for further processing
                    self.nodes[n].output = self.nodes[n].output + \
                        self.nodes[n].forward(x1, node_num=n, types=types) + \
                        self.nodes[n].forward(x2, node_num=n, types=types)  # Ltrain(w±, alpha)

            # 'add' then 'concat' the feature maps from the different nodes
            # needs to take care of tensor dimension mismatch
            # See https://github.com/D-X-Y/AutoDL-Projects/issues/99#issuecomment-869100416
            # self.output = self.output + self.nodes[n].output

            # tensorflow does not like the use of self.variable inside def forward() unlike in Pytorch.
            # Tensorflow prefers the use of a new intermediate variable instead of self.variable
            value = self.output

            if USE_CUDA:
                self.nodes[n].output = self.nodes[n].output.cuda()
                value = value.cuda()

            value = value + self.nodes[n].output
            self.output = value


# to manage all the nodes across all cells
class Graph(nn.Module):
    def __init__(self):
        super(Graph, self).__init__()

        stride = 1  # just to initialize a variable

        # for i in range(NUM_OF_CELLS):
        #     if i % INTERVAL_BETWEEN_REDUCTION_CELLS == 0:
        #         stride = REDUCTION_STRIDE  # to emulate a reduction cell by using a normal cell with stride=2
        #     else:
        #         stride = NORMAL_STRIDE  # normal cell

        self.cells = nn.ModuleList([Cell(stride) for i in range(NUM_OF_CELLS)])

        self.linears = nn.Linear(NUM_OF_IMAGE_CHANNELS * IMAGE_HEIGHT * IMAGE_WIDTH, NUM_OF_IMAGE_CLASSES)

        self.softmax = nn.Softmax(1)


    def reinit(self):
        # See https://discuss.pytorch.org/t/tensorboard-issue-with-self-defined-forward-function/140628/20?u=promach
        for c in range(NUM_OF_CELLS):
            self.cells[c].reinit()

            for n in range(NUM_OF_NODES_IN_EACH_CELL):
                self.cells[c].nodes[n].reinit()

                # not all nodes have the same number of Type-1 output connections
                for cc in range(MAX_NUM_OF_CONNECTIONS_PER_NODE - n - 1):
                    self.cells[c].nodes[n].connections[cc].reinit()


    def print_debug(self):
        for c in range(NUM_OF_CELLS):
            for n in range(NUM_OF_NODES_IN_EACH_CELL):
                # not all nodes have the same number of Type-1 output connections
                for cc in range(MAX_NUM_OF_CONNECTIONS_PER_NODE - n - 1):
                    for e in range(NUM_OF_MIXED_OPS):

                        if DEBUG:
                            print("c = ", c, " , n = ", n, " , cc = ", cc, " , e = ", e)

                            print("graph.cells[", c, "].nodes[", n, "].connections[", cc,
                                  "].combined_feature_map.grad_fn = ",
                                  self.cells[c].nodes[n].connections[cc].combined_feature_map.grad_fn)

                            print("graph.cells[", c, "].output.grad_fn = ",
                                  self.cells[c].output.grad_fn)

                            print("graph.cells[", c, "].nodes[", n, "].output.grad_fn = ",
                                  self.cells[c].nodes[n].output.grad_fn)

                        if VISUALIZER == 0:
                            self.cells[c].nodes[n].output.retain_grad()
                            print("gradwalk(graph.cells[", c, "].nodes[", n, "].output.grad_fn)")
                            # gradwalk(graph.cells[c].nodes[n].output.grad_fn)

            if DEBUG:
                print("graph.cells[", c, "].output.grad_fn = ",
                      self.cells[c].output.grad_fn)

            if VISUALIZER == 0:
                self.cells[c].output.retain_grad()
                print("gradwalk(graph.cells[", c, "].output.grad_fn)")
                # gradwalk(graph.cells[c].output.grad_fn)


    # See https://www.reddit.com/r/pytorch/comments/rtlvtk/tensorboard_issue_with_selfdefined_forward/
    # Tensorboard visualization requires a generic forward() function
    def forward(self, x, types=None):

        # train_inputs = x

        # https://www.reddit.com/r/learnpython/comments/no7btk/how_to_carry_extra_information_across_dag/
        # https://docs.python.org/3/tutorial/datastructures.html

        # generates a supernet consisting of 'NUM_OF_CELLS' cells
        # each cell contains 'NUM_OF_NODES_IN_EACH_CELL' nodes
        # refer to PNASNet https://arxiv.org/pdf/1712.00559.pdf#page=5 for the cell arrangement
        # https://pytorch.org/tutorials/beginner/examples_autograd/two_layer_net_custom_function.html

        # encodes the cells and nodes arrangement in the multigraph

        outputs1 = 0  # just for initialization, no special meaning

        for c in range(NUM_OF_CELLS):
            x1 = self.cells[c - 1].output
            x2 = self.cells[c - PREVIOUS_PREVIOUS].output
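            # Note: for c == 0 or c == 1 these negative indices wrap around to the tail
            # cells (standard Python indexing); Cell.forward() ignores x1/x2 in its
            # `c <= 1` branch and uses the dataset input x instead.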


            self.cells[c].forward(x, x1, x2, c, types=types)

        output_tensor = self.cells[NUM_OF_CELLS - 1].output
        output_tensor = output_tensor.view(output_tensor.shape[0], -1)

        if USE_CUDA:
            output_tensor = output_tensor.cuda()

        if DEBUG and VISUALIZER == 0:
            print("gradwalk(output_tensor.grad_fn)")
            # gradwalk(output_tensor.grad_fn)

        if USE_CUDA:
            outputs1 = self.linears(output_tensor).cuda()

        else:
            outputs1 = self.linears(output_tensor)

        outputs1 = self.softmax(outputs1)

        if USE_CUDA:
            outputs1 = outputs1.cuda()

        return outputs1


total_grad_out = []
total_grad_in = []




def hook_fn_backward(module, grad_input, grad_output):
    print(module)  # for distinguishing between modules

    # In order to follow the order of back-propagation, print grad_output first
    print('grad_output', grad_output)

    # then print grad_input
    print('grad_input', grad_input)

    # save to the global variables
    total_grad_in.append(grad_input)
    total_grad_out.append(grad_output)


# for tracking the gradient back-propagation operations
def gradwalk(x, _depth=0):
    if hasattr(x, 'grad'):
        x = x.grad

    if hasattr(x, 'next_functions'):
        for fn in x.next_functions:
            print(' ' * _depth + str(fn))
            gradwalk(fn[0], _depth + 1)
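# Hypothetical usage sketch (gradwalk is only referenced in comments elsewhere in this file):
#   loss = criterion(NN_output, NN_train_labels)
#   loss.backward(retain_graph=True)
#   gradwalk(loss.grad_fn)  # prints the autograd graph, one indent level per depth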




# Function to convert the model to ONNX format
def Convert_ONNX(model, model_input):

    # Export the model
    torch.onnx.export(model,                     # model being run
                      model_input,               # model input (or a tuple for multiple inputs)
                      "gdas.onnx",               # where to save the model
                      export_params=True,        # store the trained parameter weights inside the model file
                      opset_version=10,          # the ONNX version to export the model to
                      do_constant_folding=True,  # whether to execute constant folding for optimization
                      input_names=['modelInput'],    # the model's input names
                      output_names=['modelOutput'],  # the model's output names
                      dynamic_axes={'modelInput': {0: 'batch_size'},  # variable-length axes
                                    'modelOutput': {0: 'batch_size'}})
    print(" ")
    print('Model has been converted to ONNX')
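# Hypothetical usage sketch (Convert_ONNX is not called anywhere in this section;
# shapes assumed from the constants above):
#   dummy_input = torch.randn(BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH)
#   Convert_ONNX(Graph(), dummy_input)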




# https://translate.google.com/translate?sl=auto&tl=en&u=http://khanrc.github.io/nas-4-darts-tutorial.html
def train_NN(forward_pass_only):
    if DEBUG:
        print("Entering train_NN(), forward_pass_only = ", forward_pass_only)

    graph = Graph()

    if USE_CUDA:
        graph = graph.cuda()

    if DEBUG:
        modules = graph.named_children()
        print("modules = ", modules)

    if VISUALIZER == 0:
        # Tensorboard does not like backward hooks
        for name, module in graph.named_modules():
            module.register_full_backward_hook(hook_fn_backward)

    criterion = nn.CrossEntropyLoss()
    # criterion = nn.BCELoss()
    optimizer1 = optim.SGD(graph.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)

    # just for initialization, no special meaning
    Ltrain = 0
    NN_input = 0
    NN_output = torch.tensor(0)
    NN_train_labels = 0

    for train_data, val_data in zip(trainloader, valloader):

        NN_input, NN_train_labels = train_data
        # val_inputs, val_labels = val_data

        if USE_CUDA:
            NN_input = NN_input.cuda()
            NN_train_labels = NN_train_labels.cuda()

        # normalize inputs
        NN_input = NN_input / 25