Spaces:

CVPR
/

v-doc_abstractive_mac

Runtime error

App Files Files Community

ydin0771 commited on Jun 13, 2022

Commit

ba880ef

1 Parent(s): 22774f1

Upload preprocess.py

Browse files

Files changed (1) hide show

preprocess.py +551 -0

preprocess.py ADDED Viewed

	@@ -0,0 +1,551 @@

+import time
+import os
+import random
+import json
+import pickle
+import numpy as np
+from tqdm import tqdm
+from termcolor import colored
+from program_translator import ProgramTranslator  #
+from config import config
+# Print bold tex
+def bold(txt):
+    return colored(str(txt), attrs=["bold"])
+# Print bold and colored text
+def bcolored(txt, color):
+    return colored(str(txt), color, attrs=["bold"])
+# Write a line to file
+def writeline(f, line):
+    f.write(str(line) + "\n")
+# Write a list to file
+def writelist(f, l):
+    writeline(f, ",".join(map(str, l)))
+# 2d list to numpy
+def vectorize2DList(items, minX=0, minY=0, dtype=np.int):
+    maxX = max(len(items), minX)
+    maxY = max([len(item) for item in items] + [minY])
+    t = np.zeros((maxX, maxY), dtype=dtype)
+    tLengths = np.zeros((maxX,), dtype=np.int)
+    for i, item in enumerate(items):
+        t[i, 0:len(item)] = np.array(item, dtype=dtype)
+        tLengths[i] = len(item)
+    return t, tLengths
+# 3d list to numpy
+def vectorize3DList(items, minX=0, minY=0, minZ=0, dtype=np.int):
+    maxX = max(len(items), minX)
+    maxY = max([len(item) for item in items] + [minY])
+    maxZ = max([len(subitem) for item in items for subitem in item] + [minZ])
+    t = np.zeros((maxX, maxY, maxZ), dtype=dtype)
+    tLengths = np.zeros((maxX, maxY), dtype=np.int)
+    for i, item in enumerate(items):
+        for j, subitem in enumerate(item):
+            t[i, j, 0:len(subitem)] = np.array(subitem, dtype=dtype)
+            tLengths[i, j] = len(subitem)
+    return t, tLengths
+'''
+Encodes text into integers. Keeps dictionary between string words (symbols)
+and their matching integers. Supports encoding and decoding.
+'''
+class SymbolDict(object):
+    def __init__(self, empty=False):
+        self.padding = "<PAD>"
+        self.unknown = "<UNK>"
+        self.start = "<START>"
+        self.end = "<END>"
+        self.invalidSymbols = [self.padding, self.unknown, self.start, self.end]
+        if empty:
+            self.sym2id = {}
+            self.id2sym = []
+        else:
+            self.sym2id = {self.padding: 0, self.unknown: 1, self.start: 2, self.end: 3}
+            self.id2sym = [self.padding, self.unknown, self.start, self.end]
+        self.allSeqs = []
+    def getNumSymbols(self):
+        return len(self.sym2id)
+    def isPadding(self, enc):
+        return enc == 0
+    def isUnknown(self, enc):
+        return enc == 1
+    def isStart(self, enc):
+        return enc == 2
+    def isEnd(self, enc):
+        return enc == 3
+    def isValid(self, enc):
+        return enc < self.getNumSymbols() and enc >= len(self.invalidSymbols)
+    def resetSeqs(self):
+        self.allSeqs = []
+    def addSeq(self, seq):
+        self.allSeqs += seq
+    # Call to create the words-to-integers vocabulary after (reading word sequences with addSeq).
+    def createVocab(self, minCount=0):
+        counter = {}
+        for symbol in self.allSeqs:
+            counter[symbol] = counter.get(symbol, 0) + 1
+        for symbol in counter:
+            if counter[symbol] > minCount and (symbol not in self.sym2id):
+                self.sym2id[symbol] = self.getNumSymbols()
+                self.id2sym.append(symbol)
+    # Encodes a symbol. Returns the matching integer.
+    def encodeSym(self, symbol):
+        if symbol not in self.sym2id:
+            symbol = self.unknown
+        return self.sym2id[symbol]
+    '''
+    Encodes a sequence of symbols.
+    Optionally add start, or end symbols.
+    Optionally reverse sequence
+    '''
+    def encodeSequence(self, decoded, addStart=False, addEnd=False, reverse=False):
+        if reverse:
+            decoded.reverse()
+        if addStart:
+            decoded = [self.start] + decoded
+        if addEnd:
+            decoded = decoded + [self.end]
+        encoded = [self.encodeSym(symbol) for symbol in decoded]
+        return encoded
+    # Decodes an integer into its symbol
+    def decodeId(self, enc):
+        return self.id2sym[enc] if enc < self.getNumSymbols() else self.unknown
+    '''
+    Decodes a sequence of integers into their symbols.
+    If delim is given, joins the symbols using delim,
+    Optionally reverse the resulted sequence
+    '''
+    def decodeSequence(self, encoded, delim=None, reverse=False, stopAtInvalid=True):
+        length = 0
+        for i in range(len(encoded)):
+            if not self.isValid(encoded[i]) and stopAtInvalid:
+                break
+            length += 1
+        encoded = encoded[:length]
+        decoded = [self.decodeId(enc) for enc in encoded]
+        if reverse:
+            decoded.reverse()
+        if delim is not None:
+            return delim.join(decoded)
+        return decoded
+'''
+Preprocesses a given dataset into numpy arrays.
+By calling preprocess, the class:
+1. Reads the input data files into dictionary.
+2. Saves the results jsons in files and loads them instead of parsing input if files exist/
+3. Initializes word embeddings to random / GloVe.
+4. Optionally filters data according to given filters.
+5. Encodes and vectorize the data into numpy arrays.
+6. Buckets the data according to the instances length.
+'''
+class Preprocesser(object):
+    def __init__(self):
+        self.questionDict = SymbolDict()
+        self.answerDict = SymbolDict(empty=True)
+        self.qaDict = SymbolDict()
+        self.specificDatasetDicts = None
+        self.programDict = SymbolDict()
+        self.programTranslator = ProgramTranslator(self.programDict, 2)
+    '''
+    Tokenizes string into list of symbols.
+    Args:
+        text: raw string to tokenize.
+        ignorePuncts: punctuation to ignore
+        keptPunct: punctuation to keep (as symbol)
+        endPunct: punctuation to remove if appears at the end
+        delim: delimiter between symbols
+        clean: True to replace text in string
+        replacelistPre: dictionary of replacement to perform on the text before tokanization
+        replacelistPost: dictionary of replacement to perform on the text after tokanization
+    '''
+    # sentence tokenizer
+    allPunct = ["?", "!", "\\", "/", ")", "(", ".", ",", ";", ":"]
+    def tokenize(self, text, ignoredPuncts=["?", "!", "\\", "/", ")", "("],
+                 keptPuncts=[".", ",", ";", ":"], endPunct=[">", "<", ":"], delim=" ",
+                 clean=False, replacelistPre=dict(), replacelistPost=dict()):
+        if clean:
+            for word in replacelistPre:
+                origText = text
+                text = text.replace(word, replacelistPre[word])
+                if (origText != text):
+                    print(origText)
+                    print(text)
+                    print("")
+            for punct in endPunct:
+                if text[-1] == punct:
+                    print(text)
+                    text = text[:-1]
+                    print(text)
+                    print("")
+        for punct in keptPuncts:
+            text = text.replace(punct, delim + punct + delim)
+        for punct in ignoredPuncts:
+            text = text.replace(punct, "")
+        ret = text.lower().split(delim)
+        if clean:
+            origRet = ret
+            ret = [replacelistPost.get(word, word) for word in ret]
+            if origRet != ret:
+                print(origRet)
+                print(ret)
+        ret = [t for t in ret if t != ""]
+        return ret
+    # Read class' generated files.
+    # files interface
+    def readFiles(self, instancesFilename):
+        with open(instancesFilename, "r") as inFile:
+            instances = json.load(inFile)
+        with open(config.questionDictFile(), "rb") as inFile:
+            self.questionDict = pickle.load(inFile)
+        with open(config.answerDictFile(), "rb") as inFile:
+            self.answerDict = pickle.load(inFile)
+        with open(config.qaDictFile(), "rb") as inFile:
+            self.qaDict = pickle.load(inFile)
+        return instances
+    '''
+    Generate class' files. Save json representation of instances and
+    symbols-to-integers dictionaries.
+    '''
+    def writeFiles(self, instances, instancesFilename):
+        with open(instancesFilename, "w") as outFile:
+            json.dump(instances, outFile)
+        with open(config.questionDictFile(), "wb") as outFile:
+            pickle.dump(self.questionDict, outFile)
+        with open(config.answerDictFile(), "wb") as outFile:
+            pickle.dump(self.answerDict, outFile)
+        with open(config.qaDictFile(), "wb") as outFile:
+            pickle.dump(self.qaDict, outFile)
+    # Write prediction json to file and optionally a one-answer-per-line output file
+    def writePreds(self, res, tier, suffix=""):
+        if res is None:
+            return
+        preds = res["preds"]
+        sortedPreds = sorted(preds, key=lambda instance: instance["index"])
+        with open(config.predsFile(tier + suffix), "w") as outFile:
+            outFile.write(json.dumps(sortedPreds))
+        with open(config.answersFile(tier + suffix), "w") as outFile:
+            for instance in sortedPreds:
+                writeline(outFile, instance["prediction"])
+    def readPDF(self, instancesFilename):
+        instances = []
+        if os.path.exists(instancesFilename):
+            instances = self.readFiles(instancesFilename)
+        return instances
+    def readData(self, datasetFilename, instancesFilename, train):
+        # data extraction
+        datasetReader = {
+            "PDF": self.readPDF
+        }
+        return datasetReader[config.dataset](datasetFilename, instancesFilename, train)
+    def vectorizeData(self, data):
+        # if "SHARED" tie symbol representations in questions and answers
+        if config.ansEmbMod == "SHARED":
+            qDict = self.qaDict
+        else:
+            qDict = self.questionDict
+        encodedQuestion = [qDict.encodeSequence(d["questionSeq"]) for d in data]
+        question, questionL = vectorize2DList(encodedQuestion)
+        # pass the whole instances? if heavy then not good
+        imageId = [d["imageId"] for d in data]
+        instance = data
+        return {"question": question,
+                "questionLength": questionL,
+                "imageId": imageId
+                }
+    # Separates data based on a field length
+    def lseparator(self, key, lims):
+        maxI = len(lims)
+        def separatorFn(x):
+            v = x[key]
+            for i, lim in enumerate(lims):
+                if len(v) < lim:
+                    return i
+            return maxI
+        return {"separate": separatorFn, "groupsNum": maxI + 1}
+    # Buckets data to groups using a separator
+    def bucket(self, instances, separator):
+        buckets = [[] for i in range(separator["groupsNum"])]
+        for instance in instances:
+            bucketI = separator["separate"](instance)
+            buckets[bucketI].append(instance)
+        return [bucket for bucket in buckets if len(bucket) > 0]
+    # Re-buckets bucket list given a seperator
+    def rebucket(self, buckets, separator):
+        res = []
+        for bucket in buckets:
+            res += self.bucket(bucket, separator)
+        return res
+    # Buckets data based on question / program length
+    def bucketData(self, data, noBucket=False):
+        if noBucket:
+            buckets = [data]
+        else:
+            if config.noBucket:
+                buckets = [data]
+            elif config.noRebucket:
+                questionSep = self.lseparator("questionSeq", config.questionLims)
+                buckets = self.bucket(data, questionSep)
+            else:
+                programSep = self.lseparator("programSeq", config.programLims)
+                questionSep = self.lseparator("questionSeq", config.questionLims)
+                buckets = self.bucket(data, programSep)
+                buckets = self.rebucket(buckets, questionSep)
+        return buckets
+    '''
+    Prepares data:
+    1. Filters data according to above arguments.
+    2. Takes only a subset of the data based on config.trainedNum / config.testedNum
+    3. Buckets data according to question / program length
+    4. Vectorizes data into numpy arrays
+    '''
+    def prepareData(self, data, train, filterKey=None, noBucket=False):
+        filterDefault = {"maxQLength": 0, "maxPLength": 0, "onlyChain": False, "filterOp": 0}
+        filterTrain = {"maxQLength": config.tMaxQ, "maxPLength": config.tMaxP,
+                       "onlyChain": config.tOnlyChain, "filterOp": config.tFilterOp}
+        filterVal = {"maxQLength": config.vMaxQ, "maxPLength": config.vMaxP,
+                     "onlyChain": config.vOnlyChain, "filterOp": config.vFilterOp}
+        filters = {"train": filterTrain, "evalTrain": filterTrain,
+                   "val": filterVal, "test": filterDefault}
+        if filterKey is None:
+            fltr = filterDefault
+        else:
+            fltr = filters[filterKey]
+        # split data when finetuning on validation set
+        if config.trainExtra and config.extraVal and (config.finetuneNum > 0):
+            if train:
+                data = data[:config.finetuneNum]
+            else:
+                data = data[config.finetuneNum:]
+        typeFilter = config.typeFilters[fltr["filterOp"]]
+        # filter specific settings
+        if fltr["onlyChain"]:
+            data = [d for d in data if all((len(inputNum) < 2) for inputNum in d["programInputs"])]
+        if fltr["maxQLength"] > 0:
+            data = [d for d in data if len(d["questionSeq"]) <= fltr["maxQLength"]]
+        if fltr["maxPLength"] > 0:
+            data = [d for d in data if len(d["programSeq"]) <= fltr["maxPLength"]]
+        if len(typeFilter) > 0:
+            data = [d for d in data if d["programSeq"][-1] not in typeFilter]
+        # run on subset of the data. If 0 then use all data
+        num = config.trainedNum if train else config.testedNum
+        # retainVal = True to retain same clevr_sample of validation across runs
+        if (not train) and (not config.retainVal):
+            random.shuffle(data)
+        if num > 0:
+            data = data[:num]
+        # set number to match dataset size
+        if train:
+            config.trainedNum = len(data)
+        else:
+            config.testedNum = len(data)
+        # bucket
+        buckets = self.bucketData(data, noBucket=noBucket)
+        # vectorize
+        return [self.vectorizeData(bucket) for bucket in buckets]
+    # Prepares all the tiers of a dataset. See prepareData method for further details.
+    def prepareDataset(self, dataset, noBucket=False):
+        if dataset is None:
+            return None
+        for tier in dataset:
+            if dataset[tier] is not None:
+                dataset[tier]["data"] = self.prepareData(dataset[tier]["instances"],
+                                                         train=dataset[tier]["train"], filterKey=tier,
+                                                         noBucket=noBucket)
+        for tier in dataset:
+            if dataset[tier] is not None:
+                del dataset[tier]["instances"]
+        return dataset
+    # Initializes word embeddings to random uniform / random normal / GloVe.
+    def initializeWordEmbeddings(self, wordsDict=None, noPadding=False):
+        # default dictionary to use for embeddings
+        if wordsDict is None:
+            wordsDict = self.questionDict
+        # uniform initialization
+        if config.wrdEmbUniform:
+            lowInit = -1.0 * config.wrdEmbScale
+            highInit = 1.0 * config.wrdEmbScale
+            embeddings = np.random.uniform(low=lowInit, high=highInit,
+                                           size=(wordsDict.getNumSymbols(), config.wrdEmbDim))
+        # normal initialization
+        else:
+            embeddings = config.wrdEmbScale * np.random.randn(wordsDict.getNumSymbols(),
+                                                              config.wrdEmbDim)
+        # if wrdEmbRandom = False, use GloVE
+        counter = 0
+        if (not config.wrdEmbRandom):
+            with open(config.wordVectorsFile, 'r') as inFile:
+                for line in inFile:
+                    line = line.strip().split()
+                    word = line[0].lower()
+                    vector = [float(x) for x in line[1:]]
+                    index = wordsDict.sym2id.get(word)
+                    if index is not None:
+                        embeddings[index] = vector
+                        counter += 1
+        print(counter)
+        print(self.questionDict.sym2id)
+        print(len(self.questionDict.sym2id))
+        print(self.answerDict.sym2id)
+        print(len(self.answerDict.sym2id))
+        print(self.qaDict.sym2id)
+        print(len(self.qaDict.sym2id))
+        if noPadding:
+            return embeddings  # no embedding for padding symbol
+        else:
+            return embeddings[1:]
+    '''
+    Initializes words embeddings for question words and optionally for answer words
+    (when config.ansEmbMod == "BOTH"). If config.ansEmbMod == "SHARED", tie embeddings for
+    question and answer same symbols.
+    '''
+    def initializeQAEmbeddings(self):
+        # use same embeddings for questions and answers
+        if config.ansEmbMod == "SHARED":
+            qaEmbeddings = self.initializeWordEmbeddings(self.qaDict)
+            ansMap = np.array([self.qaDict.sym2id[sym] for sym in self.answerDict.id2sym])
+            embeddings = {"qa": qaEmbeddings, "ansMap": ansMap}
+        # use different embeddings for questions and answers
+        else:
+            qEmbeddings = self.initializeWordEmbeddings(self.questionDict)
+            aEmbeddings = None
+            if config.ansEmbMod == "BOTH":
+                aEmbeddings = self.initializeWordEmbeddings(self.answerDict, noPadding=True)
+            embeddings = {"q": qEmbeddings, "a": aEmbeddings}
+        return embeddings
+    '''
+    Preprocesses a given dataset into numpy arrays:
+    1. Reads the input data files into dictionary.
+    2. Saves the results jsons in files and loads them instead of parsing input if files exist/
+    3. Initializes word embeddings to random / GloVe.
+    4. Optionally filters data according to given filters.
+    5. Encodes and vectorize the data into numpy arrays.
+    5. Buckets the data according to the instances length.
+    '''
+    def preprocessData(self, question, debug=False):
+        # Read data into json and symbols' dictionaries
+        print(bold("Loading data..."))
+        start = time.time()
+        with open(config.questionDictFile(), "rb") as inFile:
+            self.questionDict = pickle.load(inFile)
+        with open(config.qaDictFile(), "rb") as inFile:
+            self.qaDict = pickle.load(inFile)
+        with open(config.answerDictFile(), "rb") as inFile:
+            self.answerDict = pickle.load(inFile)
+        question = question.replace('?', '').replace(', ', '').lower().split()
+        encodedQuestion = self.questionDict.encodeSequence(question)
+        data = {'question': np.array([encodedQuestion]), 'questionLength': np.array([len(encodedQuestion)])}
+        print("took {:.2f} seconds".format(time.time() - start))
+        # Initialize word embeddings (random / glove)
+        print(bold("Loading word vectors..."))
+        start = time.time()
+        embeddings = self.initializeQAEmbeddings()
+        print("took {:.2f} seconds".format(time.time() - start))
+        answer = 'yes'  # DUMMY_ANSWER
+        self.answerDict.addSeq([answer])
+        self.qaDict.addSeq([answer])
+        config.questionWordsNum = self.questionDict.getNumSymbols()
+        config.answerWordsNum = self.answerDict.getNumSymbols()
+        return data, embeddings, self.answerDict