satyamr196 committed on
Commit
30b7603
·
1 Parent(s): f82f487

added generate transcript fxn

Browse files
Files changed (3) hide show
  1. ASR_Server.py +172 -0
  2. requirements.txt +15 -0
  3. test.csv +0 -0
ASR_Server.py CHANGED
@@ -1,4 +1,174 @@
1
  from flask import Flask, jsonify
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  app = Flask(__name__)
4
 
@@ -25,6 +195,8 @@ def asr_models():
25
  "Fairseq S2T",
26
  "ESPnet"
27
  ]
 
 
28
  return jsonify({"asr_models": models})
29
 
30
  # if __name__ == "__main__":
 
1
  from flask import Flask, jsonify
2
+ from datasets import load_dataset, Audio
3
+ import pandas as pd
4
+ import os
5
+
6
+ # Load the dataset, then turn off automatic audio decoding: generateTranscript reads each clip itself via sample["audio"]["path"].
7
+ dataset = load_dataset("satyamr196/asr_fairness_audio", split="train")
8
+ # dataset = dataset.with_format("python", decode_audio=False)
9
+ dataset = dataset.cast_column("audio", Audio(decode=False))
10
+
11
+ print(" ___ ")
12
+ csv_path = "test.csv"
13
+ df = pd.read_csv(csv_path)
14
+ print(f"CSV Loaded with {len(df)} rows")
15
+
16
+ # def generateTranscript(ASR_model, dataset, csv_path, output_dir="./"):
17
+ # import os
18
+ # import time
19
+ # import pandas as pd
20
+ # import librosa
21
+ # import tqdm
22
+ # from transformers import pipeline
23
+
24
+ # os.makedirs(output_dir, exist_ok=True)
25
+ # # output_csv_path = os.path.join(output_dir, f"test_with_{ASR_model.replace('/', '_')}.csv")
26
+ # output_csv_path = os.path.join(output_dir, f"test_with_{ASR_model}.csv")
27
+
28
+ # if os.path.exists(output_csv_path):
29
+ # print(f"Transcript already exists for model {ASR_model}. Skipping transcription.")
30
+ # return
31
+
32
+ # # Load metadata CSV
33
+ # df = pd.read_csv(csv_path)
34
+ # print(f"CSV Loaded with {len(df)} rows")
35
+
36
+ # # Prepare
37
+ # df[df.columns[0]] = df[df.columns[0]].str.strip().str.lower()
38
+ # filename_column = df.columns[0]
39
+ # transcripts = []
40
+ # rtfx_score = []
41
+
42
+ # # Load ASR model
43
+ # pipe = pipeline("automatic-speech-recognition", model=ASR_model)
44
+
45
+ # # Create a map of dataset samples by file name (assumes filename is in dataset)
46
+ # dataset_map = {
47
+ # sample["audio"]["path"].split("/")[-1].lower(): sample for sample in dataset
48
+ # }
49
+
50
+ # for idx, row in tqdm.tqdm(df.iterrows(), total=len(df)):
51
+ # filename = row[filename_column].strip().lower() + ".wav"
52
+ # if filename in dataset_map:
53
+ # sample = dataset_map[filename]
54
+ # try:
55
+ # audio_array = sample["audio"]["array"]
56
+ # sample_rate = sample["audio"]["sampling_rate"]
57
+
58
+ # start_time = time.time()
59
+ # result = pipe({"array": audio_array, "sampling_rate": sample_rate})
60
+ # end_time = time.time()
61
+
62
+ # transcript = result["text"]
63
+ # duration = librosa.get_duration(y=audio_array, sr=sample_rate)
64
+ # rtfx = (end_time - start_time) / duration if duration > 0 else 0
65
+
66
+ # transcripts.append(transcript)
67
+ # rtfx_score.append(rtfx)
68
+
69
+ # print(f"✅ {filename}: RTFX = {rtfx:.2f}")
70
+
71
+ # except Exception as e:
72
+ # print(f"❌ Error with {filename}: {e}")
73
+ # transcripts.append("")
74
+ # rtfx_score.append(0)
75
+ # else:
76
+ # print(f"⚠️ File not in dataset: {filename}")
77
+ # transcripts.append("")
78
+ # rtfx_score.append(0)
79
+
80
+ # # Append to original DataFrame
81
+ # df['transcript'] = transcripts
82
+ # df['rtfx'] = rtfx_score
83
+
84
+ # df.to_csv(output_csv_path, index=False)
85
+ # print(f"✅ Transcripts saved to {output_csv_path}")
86
+
87
def _read_audio(audio_info):
    """Return ``(samples, sample_rate)`` for one undecoded dataset audio entry.

    ``audio_info`` is the ``sample["audio"]`` mapping of a dataset whose audio
    column is not decoded; it is read here through its ``"path"`` and
    ``"bytes"`` keys.
    """
    import io
    import os

    import soundfile as sf

    path = audio_info.get("path")
    raw = audio_info.get("bytes")

    # Prefer a real file on disk; fall back to embedded bytes when the path
    # is only a bare filename that does not exist locally.
    if path and os.path.exists(path):
        source = path
    elif raw:
        source = io.BytesIO(raw)
    else:
        source = path  # let soundfile raise a descriptive error

    samples, sample_rate = sf.read(source, dtype="float32")

    # soundfile returns (frames, channels) for multi-channel audio, but the
    # ASR pipeline only accepts a 1-D mono signal, so downmix by averaging.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)
    return samples, sample_rate


def generateTranscript(ASR_model, dataset, csv_path, output_dir="./"):
    """Transcribe every audio file listed in a CSV and save the results.

    The first column of the CSV at ``csv_path`` holds file names without the
    ``.wav`` extension. Each name is matched (case-insensitively) against the
    base name of ``sample["audio"]["path"]`` in ``dataset``, transcribed with
    a Hugging Face ``automatic-speech-recognition`` pipeline on CPU, and the
    CSV is written back with two extra columns:

    * ``transcript`` - recognised text ("" when the file is missing or fails)
    * ``rtfx`` - processing seconds divided by audio seconds (0 on failure)

    Args:
        ASR_model: Model identifier passed to ``transformers.pipeline``.
            It is embedded verbatim in the output file name, so an identifier
            containing ``/`` produces a nested directory.
        dataset: Iterable of samples whose ``"audio"`` entry is an undecoded
            mapping with ``"path"`` (and optionally ``"bytes"``).
        csv_path: Path of the metadata CSV to read.
        output_dir: Directory under which ``test_with_<ASR_model>.csv`` is
            written. Defaults to the current directory.

    Returns:
        None. If the output CSV already exists, nothing is recomputed.
    """
    import os

    # Build the output path from output_dir so the "already done" check and
    # the final save always refer to the same file.
    output_csv_path = os.path.join(output_dir, f"test_with_{ASR_model}.csv")

    # Cheap check first: skip before importing heavy libraries or loading
    # the model.
    if os.path.exists(output_csv_path):
        print(f"Transcript already exists for model {ASR_model}. Skipping transcription.")
        return

    import time

    import pandas as pd
    import tqdm
    from transformers import pipeline

    # Load CSV
    df = pd.read_csv(csv_path)
    print(f"CSV Loaded with {len(df)} rows")

    # Initialize ASR pipeline (device=-1 selects the CPU)
    pipe = pipeline("automatic-speech-recognition", model=ASR_model, device=-1)
    print("Device set to use CPU")

    # Normalise the filename column. astype(str) keeps numeric or missing
    # values from raising in the .str accessor; they simply fail the lookup.
    filename_column = df.columns[0]
    df[filename_column] = df[filename_column].astype(str).str.strip().str.lower()

    # Build map from lower-cased base filename -> dataset sample.
    print("Creating dataset map from filenames...")
    dataset_map = {
        os.path.basename(sample["audio"]["path"]).lower(): sample
        for sample in dataset
        if sample["audio"].get("path")
    }

    transcripts = []
    rtfx_score = []

    for name in tqdm.tqdm(df[filename_column], total=len(df)):
        filename = name + ".wav"
        sample = dataset_map.get(filename)

        if sample is None:
            print(f"❌ File not found in dataset: {filename}")
            transcripts.append("")
            rtfx_score.append(0)
            continue

        try:
            # Decode audio only when needed
            audio_array, sample_rate = _read_audio(sample["audio"])

            start_time = time.perf_counter()
            result = pipe({"array": audio_array, "sampling_rate": sample_rate})
            elapsed = time.perf_counter() - start_time

            duration = len(audio_array) / sample_rate
            rtfx = elapsed / duration if duration > 0 else 0
            transcript = result["text"]
        except Exception as e:
            # Best effort: one bad file must not abort the whole batch.
            print(f"❌ Error with {filename}: {e}")
            transcripts.append("")
            rtfx_score.append(0)
        else:
            transcripts.append(transcript)
            rtfx_score.append(rtfx)
            print(f"✅ {filename}: RTFX = {rtfx:.2f}")

    # Save results
    df["transcript"] = transcripts
    df["rtfx"] = rtfx_score

    # The model id may contain "/", so the real parent directory can be
    # deeper than output_dir; create whatever the final path requires.
    target_dir = os.path.dirname(output_csv_path) or "."
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir, exist_ok=True)
        print(f"Created directory: {target_dir}")

    df.to_csv(output_csv_path, index=False)

    print(f"\n📄 Transcripts saved to: {output_csv_path}")
170
+
171
+
172
 
173
  app = Flask(__name__)
174
 
 
195
  "Fairseq S2T",
196
  "ESPnet"
197
  ]
198
+ generateTranscript("openai/whisper-base", dataset, csv_path, output_dir="./") ;
199
+ # print("Transcript generation completed.")
200
  return jsonify({"asr_models": models})
201
 
202
  # if __name__ == "__main__":
requirements.txt CHANGED
@@ -1,2 +1,17 @@
1
  flask
2
  gunicorn
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  flask
2
  gunicorn
3
+ soundfile>=0.10.3
4
+ librosa
5
+ transformers
6
+ datasets
7
+ torch
8
+ pydub
9
+ jiwer
10
+ statsmodels
11
+ matplotlib
12
+ seaborn
13
+ flask
14
+ pymongo
15
+ flask-cors
16
+ pandas
17
+ tqdm
test.csv ADDED
The diff for this file is too large to render. See raw diff