#!/usr/bin/env python # -*- coding: utf-8 -*- from tensorflow import data, TensorShape, int64, int32 from math import exp from os import makedirs from shutil import rmtree, move, copytree from huggingface_hub import hf_hub_download import os def get_features(tokenizer, sentences, labels): features = [] for i, sentence in enumerate(sentences): inputs = tokenizer.encode_plus( sentence, add_special_tokens=True, max_length=tokenizer.model_max_length ) input_ids, token_type_ids = \ inputs['input_ids'], inputs['token_type_ids'] padding_length = tokenizer.model_max_length - len(input_ids) if tokenizer.padding_side == 'right': attention_mask = [1] * len(input_ids) + [0] * padding_length input_ids = input_ids + [tokenizer.pad_token_id] * padding_length token_type_ids = token_type_ids + \ [tokenizer.pad_token_type_id] * padding_length else: attention_mask = [0] * padding_length + [1] * len(input_ids) input_ids = [tokenizer.pad_token_id] * padding_length + input_ids token_type_ids = \ [tokenizer.pad_token_type_id] * padding_length + token_type_ids assert tokenizer.model_max_length \ == len(attention_mask) \ == len(input_ids) \ == len(token_type_ids) feature = { 'input_ids': input_ids, 'attention_mask': attention_mask, 'token_type_ids': token_type_ids, 'label': int(labels[i]) } features.append(feature) def gen(): for feature in features: yield ( { 'input_ids': feature['input_ids'], 'attention_mask': feature['attention_mask'], 'token_type_ids': feature['token_type_ids'], }, feature['label'], ) dataset = data.Dataset.from_generator( gen, ({ 'input_ids': int32, 'attention_mask': int32, 'token_type_ids': int32 }, int64), ( { 'input_ids': TensorShape([None]), 'attention_mask': TensorShape([None]), 'token_type_ids': TensorShape([None]), }, TensorShape([]), ), ) return dataset def softmax(values): exps = [exp(value) for value in values] exps_sum = sum(exp_value for exp_value in exps) return tuple(map(lambda x: x / exps_sum, exps)) def make_dir(path): try: makedirs(path) except FileExistsError: pass def remove_dir(path): rmtree(path) def copy_dir(source_path, target_path): copytree(source_path, target_path) def move_dir(source_path, target_path): move(source_path, target_path) def download_from_hub(repo_id, filename, revision=None, cache_dir=None): try: hf_hub_download(repo_id=repo_id, filename=filename, revision=revision, cache_dir=cache_dir) except Exception as exp: raise exp if cache_dir is not None: files = os.listdir(cache_dir) for f in files: if '.lock' in f: name = f[0:-5] os.rename(cache_dir+name, cache_dir+filename) os.remove(cache_dir+name+'.lock') os.remove(cache_dir+name+'.json')