Update app.py
app.py
CHANGED
@@ -1,84 +1,94 @@
-import streamlit as st
 # Import libraries
+import streamlit as st
+import gradio as gr
+import torch
+import transformers
+import librosa
+import cv2
 import numpy as np
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers
-
-# Load the text data
-text = open('shakespeare.txt', 'r').read() # Read the text file
-vocab = sorted(set(text)) # Get the unique characters in the text
-char2idx = {c: i for i, c in enumerate(vocab)} # Map characters to indices
-idx2char = np.array(vocab) # Map indices to characters
-text_as_int = np.array([char2idx[c] for c in text]) # Convert text to integers
-
-# Create training examples and targets
-seq_length = 100 # Length of the input sequence
-examples_per_epoch = len(text) // (seq_length + 1) # Number of examples per epoch
-char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int) # Create a dataset from the text
-sequences = char_dataset.batch(seq_length + 1, drop_remainder=True) # Create batches of sequences
-
-def split_input_target(chunk): # Define a function to split the input and target
-    input_text = chunk[:-1] # Input is the sequence except the last character
-    target_text = chunk[1:] # Target is the sequence except the first character
-    return input_text, target_text
-
-dataset = sequences.map(split_input_target) # Apply the function to the dataset
-
-# Shuffle and batch the dataset
-BATCH_SIZE = 1 # Batch size
-BUFFER_SIZE = 10000 # Buffer size for shuffling
-dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True) # Shuffle and batch the dataset
-
-# Define the model
-vocab_size = len(vocab) # Size of the vocabulary
-embedding_dim = 256 # Dimension of the embedding layer
-rnn_units = 1024 # Number of units in the RNN layer
-
-model = keras.Sequential([
-    layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[BATCH_SIZE, None]), # Embedding layer
-    layers.GRU(rnn_units, return_sequences=True, stateful=True), # GRU layer
-    layers.Dense(vocab_size) # Dense layer with vocab_size units
-])
-
-# Define the loss function
-def loss(labels, logits):
-    return keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)
-
-# Compile the model
-model.compile(optimizer='adam', loss=loss)
-
-# Define a function to generate text
-def generate_text(model, start_string):
-    num_generate = 50 # Number of characters to generate
-    input_eval = [char2idx[s] for s in start_string] # Convert the start string to numbers
-    input_eval = tf.expand_dims(input_eval, 0) # Expand the dimension for batch size
-    text_generated = [] # Empty list to store the generated text
-
-    temperature = 1.0 # Temperature parameter to control the randomness
-
-    model.reset_states() # Reset the states of the model
-
-    for i in range(num_generate): # Loop over the number of characters to generate
-        predictions = model(input_eval) # Get the predictions from the model
-        predictions = tf.squeeze(predictions, 0) # Remove the batch dimension
-
-        predictions = predictions / temperature # Divide by temperature to increase or decrease randomness
-        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy() # Sample from the predictions
-
-        input_eval = tf.expand_dims([predicted_id], 0) # Update the input with the predicted id
-
-        text_generated.append(idx2char[predicted_id]) # Append the predicted character to the generated text
-
-    return (start_string + ''.join(text_generated)) # Return the start string and the generated text
-
-# Train the model
-EPOCHS = 1 # Number of epochs to train
 
-
-
-
-
-
-
-
+# Load models
+text_model = transformers.pipeline("text-generation")
+audio_model = transformers.Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
+audio_tokenizer = transformers.Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
+image_model = transformers.pipeline("image-classification")
+video_model = transformers.VideoClassificationPipeline(model="facebook/mmf-vit-base-16", feature_extractor="facebook/mmf-vit-base-16")
+
+# Define functions for processing inputs and outputs
+def text_to_text(input):
+    output = text_model(input, max_length=50)
+    return output[0]["generated_text"]
+
+def text_to_audio(input):
+    output = text_model(input, max_length=50)
+    output = gr.outputs.Audio.from_str(output[0]["generated_text"])
+    return output
+
+def text_to_image(input):
+    output = text_model(input, max_length=50)
+    output = gr.outputs.Image.from_str(output[0]["generated_text"])
+    return output
+
+def text_to_video(input):
+    output = text_model(input, max_length=50)
+    output = gr.outputs.Video.from_str(output[0]["generated_text"])
+    return output
+
+def audio_to_text(input):
+    input = librosa.load(input)[0]
+    input = torch.from_numpy(input).unsqueeze(0)
+    logits = audio_model(input).logits
+    predicted_ids = torch.argmax(logits, dim=-1)
+    output = audio_tokenizer.batch_decode(predicted_ids)[0]
+    return output
+
+def audio_to_audio(input):
+    return input
+
+def audio_to_image(input):
+    input = librosa.load(input)[0]
+    input = torch.from_numpy(input).unsqueeze(0)
+    logits = audio_model(input).logits
+    predicted_ids = torch.argmax(logits, dim=-1)
+    output = audio_tokenizer.batch_decode(predicted_ids)[0]
+    output = gr.outputs.Image.from_str(output)
+    return output
+
+def audio_to_video(input):
+    input = librosa.load(input)[0]
+    input = torch.from_numpy(input).unsqueeze(0)
+    logits = audio_model(input).logits
+    predicted_ids = torch.argmax(logits, dim=-1)
+    output = audio_tokenizer.batch_decode(predicted_ids)[0]
+    output = gr.outputs.Video.from_str(output)
+    return output
+
+def image_to_text(input):
+    input = cv2.imread(input)
+    input = cv2.cvtColor(input, cv2.COLOR_BGR2RGB)
+    input = np.expand_dims(input, axis=0)
+    output = image_model(input)
+    return output[0]["label"]
+
+def image_to_audio(input):
+    input = cv2.imread(input)
+    input = cv2.cvtColor(input, cv2.COLOR_BGR2RGB)
+    input = np.expand_dims(input, axis=0)
+    output = image_model(input)
+    output = gr.outputs.Audio.from_str(output[0]["label"])
+    return output
+
+def image_to_image(input):
+    return input
+
+def image_to_video(input):
+    input = cv2.imread(input)
+    input = cv2.cvtColor(input, cv2.COLOR_BGR2RGB)
+    input = np.expand_dims(input, axis=0)
+    output = image_model(input)
+    output = gr.outputs.Video.from_str(output[0]["label"])
+    return output
+
+def video_to_text(input):
+    input = cv2.VideoCapture(input)
+    frames = []
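The hunk is truncated in the middle of video_to_text and before any UI code, so the Streamlit/Gradio wiring for the converters above is not visible in this diff. As a rough illustration only, and not part of the commit, here is a minimal sketch of how one of the added functions could be exposed through Gradio; it assumes text_to_text is defined as in the added code and uses only the basic gr.Interface API.

import gradio as gr

# Hypothetical wiring, not taken from the commit: expose the text_to_text
# converter defined above as a simple Gradio demo. Assumes this snippet runs
# in the same module (app.py) where text_to_text is defined.
demo = gr.Interface(
    fn=text_to_text,   # text prompt in, generated text out
    inputs="text",
    outputs="text",
    title="Text-to-text generation (sketch)",
)

if __name__ == "__main__":
    demo.launch()

A similar gr.Interface could be built per input/output pair, with the audio and image converters taking file paths as inputs, matching how they call librosa.load and cv2.imread above.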