import time
import math
import numpy as np
import tensorflow as tf

import ops
from config import config
from mac_cell import MACCell
'''
The MAC network model. It performs reasoning processes to answer a question over
a knowledge base (the image) by decomposing the question into attention-based
computational steps, each performed by a recurrent MAC cell.

The network has three main components.

The input unit: processes the network inputs: raw question strings and the image,
into distributional representations.

The MAC network: calls the MAC cells (mac_cell.py) config.netLength times,
to perform the reasoning process over the question and image.

The output unit: a classifier that receives the question and the final state of the MAC
network and uses them to compute log-likelihood over the possible one-word answers.
'''
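
# A minimal usage sketch (illustrative only; the data-loading names here are
# hypothetical and not part of this module):
#     model = MACnet(embeddingsInit, answerDict)   # __init__ calls build()
#     with tf.Session() as sess:
#         sess.run(tf.global_variables_initializer())
#         preds = model.runBatch(sess, dataBatch, imageFeatures, train = False)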
class MACnet(object):

    '''Initialize the class.

    Args:
        embeddingsInit: initialization for word embeddings (random / glove).
        answerDict: answers dictionary (mapping between integer id and symbol).
    '''
    def __init__(self, embeddingsInit, answerDict):
        self.embeddingsInit = embeddingsInit
        self.answerDict = answerDict
        self.build()

    '''
    Initializes placeholders.

    questionsIndicesAll: integer ids of question words.
    [batchSize, questionLength]

    questionLengthsAll: length of each question.
    [batchSize]

    imagesPlaceholder: image features.
    [batchSize, channels, height, width]
    (converted internally to [batchSize, height, width, channels])

    answersIndicesAll: integer ids of answer words.
    [batchSize]

    lr: learning rate (tensor scalar)
    train: train / evaluation (tensor boolean)

    dropouts: dictionary of dropout values (tensor scalars)
    '''
    # change to H x W x C?
    def addPlaceholders(self):
        with tf.variable_scope("Placeholders"):
            ## data
            # questions
            self.questionsIndicesAll = tf.placeholder(tf.int32, shape = (None, None))
            self.questionLengthsAll = tf.placeholder(tf.int32, shape = (None, ))

            # images
            # put image known dimension as last dim?
            self.imagesPlaceholder = tf.placeholder(tf.float32, shape = (None, None, None, None))
            self.imagesAll = tf.transpose(self.imagesPlaceholder, (0, 2, 3, 1))
            # self.imageH = tf.shape(self.imagesAll)[1]
            # self.imageW = tf.shape(self.imagesAll)[2]

            # answers
            self.answersIndicesAll = tf.placeholder(tf.int32, shape = (None, ))

            ## optimization
            self.lr = tf.placeholder(tf.float32, shape = ())
            self.train = tf.placeholder(tf.bool, shape = ())
            self.batchSizeAll = tf.shape(self.questionsIndicesAll)[0]

            ## dropouts
            # TODO: change dropouts to be 1 - current
            self.dropouts = {
                "encInput": tf.placeholder(tf.float32, shape = ()),
                "encState": tf.placeholder(tf.float32, shape = ()),
                "stem": tf.placeholder(tf.float32, shape = ()),
                "question": tf.placeholder(tf.float32, shape = ()),
                # self.dropouts["question"]Out = tf.placeholder(tf.float32, shape = ())
                # self.dropouts["question"]MAC = tf.placeholder(tf.float32, shape = ())
                "read": tf.placeholder(tf.float32, shape = ()),
                "write": tf.placeholder(tf.float32, shape = ()),
                "memory": tf.placeholder(tf.float32, shape = ()),
                "output": tf.placeholder(tf.float32, shape = ())
            }

            # batch norm params
            self.batchNorm = {"decay": config.bnDecay, "train": self.train}

            # if config.parametricDropout:
            #     self.dropouts["question"] = parametricDropout("qDropout", self.train)
            #     self.dropouts["read"] = parametricDropout("readDropout", self.train)
            # else:
            #     self.dropouts["question"] = self.dropouts["_q"]
            #     self.dropouts["read"] = self.dropouts["_read"]

            # if config.tempDynamic:
            #     self.tempAnnealRate = tf.placeholder(tf.float32, shape = ())

            self.H, self.W, self.imageInDim = config.imageDims
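
            # Note: imagesPlaceholder is fed in NCHW layout ([batchSize, channels,
            # height, width]) and transposed above to NHWC. The dropout placeholders
            # hold keep probabilities, fed as 1.0 at evaluation time (see createFeedDict).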

    # Feeds data into placeholders. See addPlaceholders method for further details.
    def createFeedDict(self, data, images, train):
        feedDict = {
            self.questionsIndicesAll: np.array(data["question"]),
            self.questionLengthsAll: np.array(data["questionLength"]),
            self.imagesPlaceholder: images,
            # self.answersIndicesAll: [0],

            self.dropouts["encInput"]: config.encInputDropout if train else 1.0,
            self.dropouts["encState"]: config.encStateDropout if train else 1.0,
            self.dropouts["stem"]: config.stemDropout if train else 1.0,
            self.dropouts["question"]: config.qDropout if train else 1.0,
            self.dropouts["memory"]: config.memoryDropout if train else 1.0,
            self.dropouts["read"]: config.readDropout if train else 1.0,
            self.dropouts["write"]: config.writeDropout if train else 1.0,
            self.dropouts["output"]: config.outputDropout if train else 1.0,
            # self.dropouts["question"]Out: config.qDropoutOut if train else 1.0,
            # self.dropouts["question"]MAC: config.qDropoutMAC if train else 1.0,

            self.lr: config.lr,
            self.train: train
        }

        # if config.tempDynamic:
        #     feedDict[self.tempAnnealRate] = tempAnnealRate

        return feedDict
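
    # A feed sketch (hypothetical values; e.g. 14 x 14 x 1024 ResNet features in NCHW):
    #     feed = model.createFeedDict(
    #         {"question": [[4, 8, 15]], "questionLength": [3]},   # one 3-word question
    #         images = np.zeros((1, 1024, 14, 14), dtype = np.float32),
    #         train = False)   # evaluation: all dropout keep-probs are fed as 1.0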

    # Assigns a specific GPU (tower) its slice of the data, for parallelization.
    def initTowerBatch(self, towerI, towersNum, dataSize):
        towerBatchSize = tf.floordiv(dataSize, towersNum)
        start = towerI * towerBatchSize
        end = (towerI + 1) * towerBatchSize if towerI < towersNum - 1 else dataSize

        self.questionsIndices = self.questionsIndicesAll[start:end]
        self.questionLengths = self.questionLengthsAll[start:end]
        self.images = self.imagesAll[start:end]
        self.answersIndices = self.answersIndicesAll[start:end]

        self.batchSize = end - start
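
    # E.g. dataSize = 10 and towersNum = 3 give towerBatchSize = 3 and tower slices
    # [0:3], [3:6], [6:10]: the last tower absorbs the remainder.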

    '''
    The Image Input Unit (stem). Passes the image features through a CNN network.
    Optionally adds position encoding (disabled in the default behavior).
    Flattens the image into a Height * Width "knowledge base" array.

    Args:
        images: image input. [batchSize, height, width, inDim]
        inDim: input image dimension
        outDim: image out dimension
        addLoc: if not None, adds positional encoding to the image

    Returns preprocessed images.
    [batchSize, height * width, outDim]
    '''
    def stem(self, images, inDim, outDim, addLoc = None):
        with tf.variable_scope("stem"):
            if addLoc is None:
                addLoc = config.locationAware

            if config.stemLinear:
                features = ops.linear(images, inDim, outDim)
            else:
                dims = [inDim] + ([config.stemDim] * (config.stemNumLayers - 1)) + [outDim]

                if addLoc:
                    images, inDim = ops.addLocation(images, inDim, config.locationDim,
                        h = self.H, w = self.W, locType = config.locationType)
                    dims[0] = inDim

                # if config.locationType == "PE":
                #     dims[-1] /= 4
                #     dims[-1] *= 3
                # else:
                #     dims[-1] -= 2

                features = ops.CNNLayer(images, dims,
                    batchNorm = self.batchNorm if config.stemBN else None,
                    dropout = self.dropouts["stem"],
                    kernelSizes = config.stemKernelSizes,
                    strides = config.stemStrideSizes)

                # if addLoc:
                #     lDim = outDim / 4
                #     lDim /= 4
                #     features, _ = addLocation(features, dims[-1], lDim, h = self.H,
                #         w = self.W, locType = config.locationType)

                if config.stemGridRnn:
                    features = ops.multigridRNNLayer(features, self.H, self.W, outDim)

            # flatten the 2d images into a 1d KB
            features = tf.reshape(features, (self.batchSize, -1, outDim))

        return features
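
    # Shape flow through the stem (e.g. for 14 x 14 x 1024 CLEVR ResNet features):
    #     [batchSize, 14, 14, 1024] -> CNN -> [batchSize, 14, 14, memDim]
    #     -> reshape -> [batchSize, 196, memDim] (the flattened knowledge base)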

    # Embeds questions using parametrized word embeddings.
    # The embeddings are initialized to the values supplied at class initialization.
    def qEmbeddingsOp(self, qIndices, embInit):
        with tf.variable_scope("qEmbeddings"):
            # if config.useCPU:
            #     with tf.device('/cpu:0'):
            #         embeddingsVar = tf.Variable(self.embeddingsInit, name = "embeddings", dtype = tf.float32)
            # else:
            #     embeddingsVar = tf.Variable(self.embeddingsInit, name = "embeddings", dtype = tf.float32)
            embeddingsVar = tf.get_variable("emb", initializer = tf.to_float(embInit),
                dtype = tf.float32, trainable = (not config.wrdEmbFixed))
            embeddings = tf.concat([tf.zeros((1, config.wrdEmbDim)), embeddingsVar], axis = 0)
            questions = tf.nn.embedding_lookup(embeddings, qIndices)

        return questions, embeddings

    # Embeds answer words.
    def aEmbeddingsOp(self, embInit):
        with tf.variable_scope("aEmbeddings"):
            if embInit is None:
                return None
            answerEmbeddings = tf.get_variable("emb", initializer = tf.to_float(embInit),
                dtype = tf.float32)
        return answerEmbeddings

    # Embeds question and answer words with tied embeddings.
    def qaEmbeddingsOp(self, qIndices, embInit):
        questions, qaEmbeddings = self.qEmbeddingsOp(qIndices, embInit["qa"])
        aEmbeddings = tf.nn.embedding_lookup(qaEmbeddings, embInit["ansMap"])
        return questions, qaEmbeddings, aEmbeddings

    '''
    Embeds questions (and optionally answer words) using parametrized word embeddings.
    The embeddings are initialized to the values supplied at class initialization.
    '''
    def embeddingsOp(self, qIndices, embInit):
        if config.ansEmbMod == "SHARED":
            questions, qEmb, aEmb = self.qaEmbeddingsOp(qIndices, embInit)
        else:
            questions, qEmb = self.qEmbeddingsOp(qIndices, embInit["q"])
            aEmb = self.aEmbeddingsOp(embInit["a"])
        return questions, qEmb, aEmb
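
    # Note: qEmbeddingsOp prepends a fixed zero row to the embedding table, so word id 0
    # maps to an all-zero (padding) vector. When config.ansEmbMod == "SHARED", answer
    # embeddings are gathered from that same table through the embInit["ansMap"] indices.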

    '''
    The Question Input Unit embeds the questions to randomly-initialized word vectors,
    and runs a recurrent bidirectional encoder (RNN/LSTM etc.) that gives back
    vector representations for each question (the RNN final hidden state), and
    representations for each of the question words (the RNN outputs for each word).

    The method uses a bidirectional LSTM by default.
    Optionally projects the outputs of the LSTM (with a linear projection,
    optionally with some activation).

    Args:
        questions: question word embeddings
        [batchSize, questionLength, wordEmbDim]

        questionLengths: the question lengths.
        [batchSize]

        projWords: True to apply projection on the RNN outputs.
        projQuestion: True to apply projection on the final RNN state.
        projDim: projection dimension in case projection is applied.

    Returns:
        Contextual Words: RNN outputs for the words.
        [batchSize, questionLength, ctrlDim]

        Vectorized Question: final hidden state representing the whole question.
        [batchSize, ctrlDim]
    '''
    def encoder(self, questions, questionLengths, projWords = False,
            projQuestion = False, projDim = None):

        with tf.variable_scope("encoder"):
            # variational dropout option
            varDp = None
            if config.encVariationalDropout:
                varDp = {"stateDp": self.dropouts["encState"],
                         "inputDp": self.dropouts["encInput"],
                         "inputSize": config.wrdEmbDim}

            # rnns
            for i in range(config.encNumLayers):
                questionCntxWords, vecQuestions = ops.RNNLayer(questions, questionLengths,
                    config.encDim, bi = config.encBi, cellType = config.encType,
                    dropout = self.dropouts["encInput"], varDp = varDp, name = "rnn%d" % i)

            # dropout for the question vector
            vecQuestions = tf.nn.dropout(vecQuestions, self.dropouts["question"])

            # projection of encoder outputs
            if projWords:
                questionCntxWords = ops.linear(questionCntxWords, config.encDim, projDim,
                    name = "projCW")
            if projQuestion:
                vecQuestions = ops.linear(vecQuestions, config.encDim, projDim,
                    act = config.encProjQAct, name = "projQ")

        return questionCntxWords, vecQuestions
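
    # Expected shapes (assuming ops.RNNLayer concatenates the two directions when
    # bi = config.encBi is True): questionCntxWords is [batchSize, questionLength, encDim]
    # and vecQuestions is [batchSize, encDim], before the optional projections to projDim.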

    '''
    Stacked Attention Layer for the baseline. Computes the interaction between the image
    and the previous memory, and uses it to compute attention over the image, which in
    turn is summed up with the previous memory to result in the new one.

    Args:
        images: input image.
        [batchSize, H * W, inDim]

        memory: previous memory value
        [batchSize, inDim]

        inDim: inputs dimension
        hDim: hidden dimension to compute interactions between image and memory

    Returns the new memory value.
    '''
    def baselineAttLayer(self, images, memory, inDim, hDim, name = "", reuse = None):
        with tf.variable_scope("attLayer" + name, reuse = reuse):
            # projImages = ops.linear(images, inDim, hDim, name = "projImage")
            # projMemory = tf.expand_dims(ops.linear(memory, inDim, hDim, name = "projMemory"), axis = -2)
            # if config.saMultiplicative:
            #     interactions = projImages * projMemory
            # else:
            #     interactions = tf.tanh(projImages + projMemory)
            interactions, _ = ops.mul(images, memory, inDim, proj = {"dim": hDim, "shared": False},
                interMod = config.baselineAttType)

            attention = ops.inter2att(interactions, hDim)
            summary = ops.att2Smry(attention, images)
            newMemory = memory + summary

        return newMemory
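
    # One stacked-attention step, schematically:
    #     interactions_i = f(image_i, memory)     (interaction type set by config.baselineAttType)
    #     attention      = softmax over the H * W positions (ops.inter2att)
    #     newMemory      = memory + sum_i attention_i * image_i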

    '''
    Baseline approach:
    If baselineAtt is True, applies several layers (baselineAttNumLayers)
    of stacked attention to the image and memory, where memory is initialized
    to the question vector. See baselineAttLayer for further details.

    Otherwise, computes the output features based on the image representation
    (baselineCNN), the question (baselineLSTM), or both.

    Args:
        vecQuestions: question vector representation
        [batchSize, questionDim]

        questionDim: dimension of question vectors

        images: (flattened) image representation
        [batchSize, imageDim]

        imageDim: dimension of image representations.

        hDim: hidden dimension to compute interactions between image and memory
        (for the attention-based baseline).

    Returns final features to use in the later classifier.
    [batchSize, outDim] (out dimension depends on the baseline method)
    '''
    def baseline(self, vecQuestions, questionDim, images, imageDim, hDim):
        with tf.variable_scope("baseline"):
            if config.baselineAtt:
                memory = ops.linear(vecQuestions, questionDim, hDim, name = "qProj")
                images = ops.linear(images, imageDim, hDim, name = "iProj")

                for i in range(config.baselineAttNumLayers):
                    memory = self.baselineAttLayer(images, memory, hDim, hDim,
                        name = "baseline%d" % i)
                memDim = hDim
            else:
                images, imagesDim = ops.linearizeFeatures(images, self.H, self.W,
                    imageDim, projDim = config.baselineProjDim)
                if config.baselineLSTM and config.baselineCNN:
                    memory = tf.concat([vecQuestions, images], axis = -1)
                    memDim = questionDim + imagesDim
                elif config.baselineLSTM:
                    memory = vecQuestions
                    memDim = questionDim
                else: # config.baselineCNN
                    memory = images
                    memDim = imagesDim

        return memory, memDim

    '''
    Runs the MAC recurrent network to perform the reasoning process.
    Initializes a MAC cell and runs netLength iterations.

    Currently it passes the question and knowledge base to the cell during
    its creation, such that it doesn't need to interact with it through
    inputs / outputs while running. The recurrent computation happens
    by working iteratively over the hidden (control, memory) states.

    Args:
        images: flattened image features. Used as the "knowledge base".
        (Received by default model behavior from the Image Input Unit).
        [batchSize, H * W, memDim]

        vecQuestions: vector question representations.
        (Received by default model behavior from the Question Input Unit
        as the final RNN state).
        [batchSize, ctrlDim]

        questionWords: question word embeddings.
        [batchSize, questionLength, ctrlDim]

        questionCntxWords: question contextual words.
        (Received by default model behavior from the Question Input Unit
        as the series of RNN output states).
        [batchSize, questionLength, ctrlDim]

        questionLengths: question lengths.
        [batchSize]

    Returns the final control state and memory state resulting from the network.
    ([batchSize, ctrlDim], [batchSize, memDim])
    '''
    def MACnetwork(self, images, vecQuestions, questionWords, questionCntxWords,
            questionLengths, name = "", reuse = None):

        with tf.variable_scope("MACnetwork" + name, reuse = reuse):

            self.macCell = MACCell(
                vecQuestions = vecQuestions,
                questionWords = questionWords,
                questionCntxWords = questionCntxWords,
                questionLengths = questionLengths,
                knowledgeBase = images,
                memoryDropout = self.dropouts["memory"],
                readDropout = self.dropouts["read"],
                writeDropout = self.dropouts["write"],
                # qDropoutMAC = self.qDropoutMAC,
                batchSize = self.batchSize,
                train = self.train,
                reuse = reuse)

            state = self.macCell.zero_state(self.batchSize, tf.float32)

            # inSeq = tf.unstack(inSeq, axis = 1)
            none = tf.zeros((self.batchSize, 1), dtype = tf.float32)

            # for i, inp in enumerate(inSeq):
            for i in range(config.netLength):
                self.macCell.iteration = i
                # if config.unsharedCells:
                #     with tf.variable_scope("iteration%d" % i):
                #         macCell.myNameScope = "iteration%d" % i
                _, state = self.macCell(none, state)
                # else:
                #     _, state = macCell(none, state)
                #     macCell.reuse = True

            # self.autoEncMMLoss = macCell.autoEncMMLossI
            # inputSeqL = None
            # _, lastOutputs = tf.nn.dynamic_rnn(macCell, inputSeq, # / static
            #     sequence_length = inputSeqL,
            #     initial_state = initialState,
            #     swap_memory = True)

            # self.postModules = None
            # if (config.controlPostRNN or config.selfAttentionMod == "POST"): # may not work well with dlogits
            #     self.postModules, _ = self.RNNLayer(cLogits, None, config.encDim, bi = False,
            #         name = "decPostRNN", cellType = config.controlPostRNNmod)
            #     if config.controlPostRNN:
            #         logits = self.postModules
            #     self.postModules = tf.unstack(self.postModules, axis = 1)

            # self.autoEncCtrlLoss = tf.constant(0.0)
            # if config.autoEncCtrl:
            #     autoEncCtrlCellType = ("GRU" if config.autoEncCtrlGRU else "RNN")
            #     autoEncCtrlinp = logits
            #     _, autoEncHid = self.RNNLayer(autoEncCtrlinp, None, config.encDim,
            #         bi = True, name = "autoEncCtrl", cellType = autoEncCtrlCellType)
            #     self.autoEncCtrlLoss = (tf.nn.l2_loss(vecQuestions - autoEncHid)) / tf.to_float(self.batchSize)

            finalControl = state.control
            finalMemory = state.memory

        return finalControl, finalMemory
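
    # Note: a single MACCell instance is applied config.netLength times, so all
    # iterations share one set of weights; the recurrence is
    #     state_{i+1} = MACCell(dummyInput, state_i)
    # where state carries the (control, memory) pair and dummyInput is the constant
    # zero tensor created above.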

    '''
    Output Unit (step 1): chooses the inputs to the output classifier.
    By default the classifier input will be the final memory state of the MAC network.
    If outQuestion is True, concatenates the question representation to that.
    If outImage is True, concatenates the flattened image representation.

    Args:
        memory: (final) memory state of the MAC network.
        [batchSize, memDim]

        vecQuestions: question vector representation.
        [batchSize, ctrlDim]

        images: image features.
        [batchSize, H, W, imageInDim]

        imageInDim: images dimension.

    Returns the resulting features and their dimension.
    '''
    def outputOp(self, memory, vecQuestions, images, imageInDim):
        with tf.variable_scope("outputUnit"):
            features = memory
            dim = config.memDim

            if config.outQuestion:
                eVecQuestions = ops.linear(vecQuestions, config.ctrlDim, config.memDim, name = "outQuestion")
                features, dim = ops.concat(features, eVecQuestions, config.memDim, mul = config.outQuestionMul)

            if config.outImage:
                images, imagesDim = ops.linearizeFeatures(images, self.H, self.W, self.imageInDim,
                    outputDim = config.outImageDim)
                images = ops.linear(images, imagesDim, config.outImageDim, name = "outImage")
                features = tf.concat([features, images], axis = -1)
                dim += config.outImageDim

        return features, dim

    '''
    Output Unit (step 2): computes the logits for the answers. Passes the features
    through a fully-connected network to get the logits over the possible answers.
    Optionally uses answer word embeddings in computing the logits (by default, it doesn't).

    Args:
        features: features used to compute logits
        [batchSize, inDim]

        inDim: features dimension

        aEmbeddings: supplied word embeddings for answer words, in case answerMod is not NON.
        Optionally computes logits by taking a dot-product with the answer embeddings.

    Returns the computed logits.
    [batchSize, answerWordsNum]
    '''
    def classifier(self, features, inDim, aEmbeddings = None):
        with tf.variable_scope("classifier"):
            outDim = config.answerWordsNum
            dims = [inDim] + config.outClassifierDims + [outDim]
            if config.answerMod != "NON":
                dims[-1] = config.wrdEmbDim

            logits = ops.FCLayer(features, dims,
                batchNorm = self.batchNorm if config.outputBN else None,
                dropout = self.dropouts["output"])

            if config.answerMod != "NON":
                logits = tf.nn.dropout(logits, self.dropouts["output"])
                interactions, _ = ops.mul(aEmbeddings, logits, dims[-1], interMod = config.answerMod)
                logits = ops.inter2logits(interactions, dims[-1], sumMod = "SUM")
                logits += ops.getBias((outDim, ), "ans")

                # answersWeights = tf.transpose(aEmbeddings)

                # if config.answerMod == "BL":
                #     Wans = ops.getWeight((dims[-1], config.wrdEmbDim), "ans")
                #     logits = tf.matmul(logits, Wans)
                # elif config.answerMod == "DIAG":
                #     Wans = ops.getWeight((config.wrdEmbDim, ), "ans")
                #     logits = logits * Wans
                # logits = tf.matmul(logits, answersWeights)

        return logits
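
    # With config.answerMod != "NON", the FC network emits a wrdEmbDim query vector
    # instead of direct class scores; the logits are then the interaction (e.g. a
    # dot-product) of that query with each answer embedding, plus a learned per-answer bias.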

    # def getTemp(self):
    #     with tf.variable_scope("temperature"):
    #         if config.tempParametric:
    #             self.temperatureVar = tf.get_variable("temperature", shape = (),
    #                 initializer = tf.constant_initializer(5), dtype = tf.float32)
    #             temperature = tf.sigmoid(self.temperatureVar)
    #         else:
    #             temperature = config.temperature
    #
    #         if config.tempDynamic:
    #             temperature *= self.tempAnnealRate
    #
    #         return temperature

    # Computes the mean cross-entropy loss between logits and answers.
    def addAnswerLossOp(self, logits, answers):
        with tf.variable_scope("answerLoss"):
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels = answers, logits = logits)
            loss = tf.reduce_mean(losses)
            self.answerLossList.append(loss)

        return loss, losses

    # Computes predictions (by finding the maximal logit value, corresponding to
    # the highest probability) and the mean accuracy between predictions and answers.
    def addPredOp(self, logits, answers):
        with tf.variable_scope("pred"):
            preds = tf.to_int32(tf.argmax(logits, axis = -1)) # tf.nn.softmax(
            corrects = tf.equal(preds, answers)
            correctNum = tf.reduce_sum(tf.to_int32(corrects))
            acc = tf.reduce_mean(tf.to_float(corrects))
            self.correctNumList.append(correctNum)
            self.answerAccList.append(acc)

        return preds, corrects, correctNum

    # Creates the optimizer (Adam).
    def addOptimizerOp(self):
        with tf.variable_scope("trainAddOptimizer"):
            self.globalStep = tf.Variable(0, dtype = tf.int32, trainable = False, name = "globalStep") # init to 0 every run?
            optimizer = tf.train.AdamOptimizer(learning_rate = self.lr)

        return optimizer

    '''
    Computes gradients for all variables, or a subset of them, based on the provided
    loss, using the optimizer.
    '''
    def computeGradients(self, optimizer, loss, trainableVars = None): # tf.trainable_variables()
        with tf.variable_scope("computeGradients"):
            if config.trainSubset:
                trainableVars = []
                allVars = tf.trainable_variables()
                for var in allVars:
                    if any((s in var.name) for s in config.varSubset):
                        trainableVars.append(var)

            gradients_vars = optimizer.compute_gradients(loss, trainableVars)
        return gradients_vars
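
    # E.g. with config.trainSubset set and config.varSubset = ["outputUnit"] (a
    # hypothetical fine-tuning setup), only variables whose names contain "outputUnit"
    # receive gradients.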

    '''
    Applies the gradients. Optionally clips them, and updates the exponential moving
    averages of the parameters.
    '''
    def addTrainingOp(self, optimizer, gradients_vars):
        with tf.variable_scope("train"):
            gradients, variables = zip(*gradients_vars)
            norm = tf.global_norm(gradients)

            # gradient clipping
            if config.clipGradients:
                clippedGradients, _ = tf.clip_by_global_norm(gradients, config.gradMaxNorm, use_norm = norm)
                gradients_vars = zip(clippedGradients, variables)

            # update ops (for batch norm) and train op
            updateOps = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(updateOps):
                train = optimizer.apply_gradients(gradients_vars, global_step = self.globalStep)

            # exponential moving average
            if config.useEMA:
                ema = tf.train.ExponentialMovingAverage(decay = config.emaDecayRate)
                maintainAveragesOp = ema.apply(tf.trainable_variables())

                with tf.control_dependencies([train]):
                    trainAndUpdateOp = tf.group(maintainAveragesOp)

                train = trainAndUpdateOp

                self.emaDict = ema.variables_to_restore()

        return train, norm
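
    # Note: self.emaDict maps shadow-average names to model variables, so evaluation
    # code can load the averaged weights with, e.g., tf.train.Saver(self.emaDict).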

    # TODO (add back support for multi-gpu..)
    def averageAcrossTowers(self, gpusNum):
        self.lossAll = self.lossList[0]

        self.answerLossAll = self.answerLossList[0]
        self.correctNumAll = self.correctNumList[0]
        self.answerAccAll = self.answerAccList[0]
        self.predsAll = self.predsList[0]
        self.gradientVarsAll = self.gradientVarsList[0]

    # Trims a padded 2D batch down to the longest actual length in the batch.
    def trim2DVectors(self, vectors, vectorsLengths):
        maxLength = np.max(vectorsLengths)
        return vectors[:, :maxLength]
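
    # E.g. a padded batch of shape [3, 50] with vectorsLengths = [3, 5, 2] is
    # trimmed to shape [3, 5].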

    # Trims the question matrix of a data batch. See trim2DVectors.
    def trimData(self, data):
        data["question"] = self.trim2DVectors(data["question"], data["questionLength"])
        return data

    '''
    Decodes a predicted answer id back into its answer word, using the answers
    dictionary supplied at class initialization.
    '''
    def buildPredsList(self, prediction):
        return self.answerDict.decodeId(prediction)

    '''
    Processes a batch of data with the model.

    Args:
        sess: TF session

        data: data batch. Dictionary that contains numpy arrays for:
        questions, questionLengths, answers.
        See preprocess.py for further information about the batch structure.

        images: batch of image features, as a numpy array of shape
        [batchSize, channels, h, w].

        train: True to run the batch for training.

        getAtt: True to return attention maps for question and image (and optionally
        self-attention and gate values).

    Returns the predicted answers for the batch.
    '''
    def runBatch(self, sess, data, images, train, getAtt = False):
        data = self.trimData(data)

        predsOp = self.predsAll

        time0 = time.time()
        feed = self.createFeedDict(data, images, train)

        time1 = time.time()
        predsInfo = sess.run(predsOp, feed_dict = feed)
        time2 = time.time()

        predsList = self.buildPredsList(predsInfo[0])

        return predsList
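
    # Note: this is an inference-only runner; the time0 / time1 / time2 stamps are
    # collected but not returned, and only the first prediction in the batch is decoded.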

    def build(self):
        self.addPlaceholders()
        self.optimizer = self.addOptimizerOp()

        self.gradientVarsList = []
        self.lossList = []

        self.answerLossList = []
        self.correctNumList = []
        self.answerAccList = []
        self.predsList = []

        with tf.variable_scope("macModel"):
            for i in range(config.gpusNum):
                with tf.device("/gpu:{}".format(i)):
                    with tf.name_scope("tower{}".format(i)) as scope:
                        self.initTowerBatch(i, config.gpusNum, self.batchSizeAll)

                        self.loss = tf.constant(0.0)

                        # embed question words (and optionally answer words)
                        questionWords, qEmbeddings, aEmbeddings = \
                            self.embeddingsOp(self.questionsIndices, self.embeddingsInit)

                        projWords = projQuestion = ((config.encDim != config.ctrlDim) or config.encProj)
                        questionCntxWords, vecQuestions = self.encoder(questionWords,
                            self.questionLengths, projWords, projQuestion, config.ctrlDim)

                        # Image Input Unit (stem)
                        imageFeatures = self.stem(self.images, self.imageInDim, config.memDim)

                        # baseline model
                        if config.useBaseline:
                            output, dim = self.baseline(vecQuestions, config.ctrlDim,
                                self.images, self.imageInDim, config.attDim)
                        # MAC model
                        else:
                            # self.temperature = self.getTemp()

                            finalControl, finalMemory = self.MACnetwork(imageFeatures, vecQuestions,
                                questionWords, questionCntxWords, self.questionLengths)

                            # Output Unit - step 1 (preparing classifier inputs)
                            output, dim = self.outputOp(finalMemory, vecQuestions,
                                self.images, self.imageInDim)

                        # Output Unit - step 2 (classifier)
                        logits = self.classifier(output, dim, aEmbeddings)

                        # compute loss, predictions, accuracy
                        answerLoss, self.losses = self.addAnswerLossOp(logits, self.answersIndices)
                        self.preds, self.corrects, self.correctNum = self.addPredOp(logits, self.answersIndices)
                        self.loss += answerLoss
                        self.predsList.append(self.preds)

                        self.lossList.append(self.loss)

                        # compute gradients
                        gradient_vars = self.computeGradients(self.optimizer, self.loss, trainableVars = None)
                        self.gradientVarsList.append(gradient_vars)

                        # reuse variables in next towers
                        tf.get_variable_scope().reuse_variables()

        self.averageAcrossTowers(config.gpusNum)

        self.trainOp, self.gradNorm = self.addTrainingOp(self.optimizer, self.gradientVarsAll)
        self.noOp = tf.no_op()