harshith1411 committed on
Commit
ac292eb
·
verified ·
1 Parent(s): 976055f

Upload gen_ai(proj_47).ipynb

Browse files
Files changed (1) hide show
  1. gen_ai(proj_47).ipynb +857 -0
gen_ai(proj_47).ipynb ADDED
@@ -0,0 +1,857 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {
7
+ "colab": {
8
+ "base_uri": "https://localhost:8080/"
9
+ },
10
+ "id": "1UhcygQ3Xvl0",
11
+ "outputId": "ef65d3a7-2758-4345-8da8-ae899d2a73c1"
12
+ },
13
+ "outputs": [
14
+ {
15
+ "name": "stdout",
16
+ "output_type": "stream",
17
+ "text": [
18
+ "Collecting zipfile36\n",
19
+ " Downloading zipfile36-0.1.3-py3-none-any.whl.metadata (736 bytes)\n",
20
+ "Downloading zipfile36-0.1.3-py3-none-any.whl (20 kB)\n",
21
+ "Installing collected packages: zipfile36\n",
22
+ "Successfully installed zipfile36-0.1.3\n"
23
+ ]
24
+ }
25
+ ],
26
+ "source": [
27
+ "!pip install zipfile36"
28
+ ]
29
+ },
30
+ {
31
+ "cell_type": "code",
32
+ "execution_count": null,
33
+ "metadata": {
34
+ "colab": {
35
+ "base_uri": "https://localhost:8080/"
36
+ },
37
+ "id": "ktv3__0PVhvW",
38
+ "outputId": "59911ce3-7b6b-4ca1-b518-536d2676fdaa"
39
+ },
40
+ "outputs": [
41
+ {
42
+ "output_type": "stream",
43
+ "name": "stdout",
44
+ "text": [
45
+ " context \\\n",
46
+ "0 Super Bowl 50 was an American football game to... \n",
47
+ "1 One of the most famous people born in Warsaw w... \n",
48
+ "2 The Normans (Norman: Nourmands; French: Norman... \n",
49
+ "3 Nikola Tesla (Serbian Cyrillic: Никола Тесла; ... \n",
50
+ "4 Computational complexity theory is a branch of... \n",
51
+ "\n",
52
+ " question \\\n",
53
+ "0 Which NFL team represented the AFC at Super Bo... \n",
54
+ "1 What was Maria Curie the first female recipien... \n",
55
+ "2 In what country is Normandy located? \n",
56
+ "3 In what year was Nikola Tesla born? \n",
57
+ "4 What branch of theoretical computer science de... \n",
58
+ "\n",
59
+ " answer \n",
60
+ "0 Denver Broncos \n",
61
+ "1 Nobel Prize \n",
62
+ "2 France \n",
63
+ "3 1856 \n",
64
+ "4 Computational complexity theory \n"
65
+ ]
66
+ }
67
+ ],
68
+ "source": [
69
+ "import pandas as pd\n",
70
+ "import zipfile\n",
71
+ "\n",
72
+ "# Load the dataset\n",
73
+ "def load_data(file_path):\n",
74
+ " with zipfile.ZipFile(file_path, 'r') as zip_ref:\n",
75
+ " # Get the first JSON file in the archive\n",
76
+ " json_file = [f for f in zip_ref.namelist() if f.endswith('.json')][0]\n",
77
+ " # Extract the JSON file to memory and load it as a DataFrame\n",
78
+ " with zip_ref.open(json_file) as f:\n",
79
+ " df = pd.read_json(f)\n",
80
+ " return df\n",
81
+ "\n",
82
+ "\n",
83
+ "# Preprocess the dataset\n",
84
+ "def preprocess_data(df):\n",
85
+ " df['context'] = df['data'].apply(lambda x: x['paragraphs'][0]['context'])\n",
86
+ " df['question'] = df['data'].apply(lambda x: x['paragraphs'][0]['qas'][0]['question'])\n",
87
+ " df['answer'] = df['data'].apply(lambda x: x['paragraphs'][0]['qas'][0]['answers'][0]['text'])\n",
88
+ " return df[['context', 'question', 'answer']]\n",
89
+ "\n",
90
+ "# Main function for loading and preprocessing\n",
91
+ "if __name__ == '__main__':\n",
92
+ " file_path = '/content/drive/MyDrive/archive (22).zip'\n",
93
+ " df = load_data(file_path)\n",
94
+ " df = preprocess_data(df)\n",
95
+ " print(df.head())"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": null,
101
+ "metadata": {
102
+ "colab": {
103
+ "base_uri": "https://localhost:8080/"
104
+ },
105
+ "id": "s9iEqWHygwd2",
106
+ "outputId": "4b6c523d-8b6c-455b-ad2a-6a44f43854de"
107
+ },
108
+ "outputs": [
109
+ {
110
+ "output_type": "stream",
111
+ "name": "stdout",
112
+ "text": [
113
+ "Available topics in the dataset:\n",
114
+ "- Roman_Republic\n",
115
+ "- Prime_minister\n",
116
+ "- Daylight_saving_time\n",
117
+ "- Xbox_360\n",
118
+ "- Post-punk\n",
119
+ "- Database\n",
120
+ "- Beer\n",
121
+ "- ASCII\n",
122
+ "- Southeast_Asia\n",
123
+ "- Time\n",
124
+ "- Software_testing\n",
125
+ "- Classical_music\n",
126
+ "- Sumer\n",
127
+ "- Race_(human_categorization)\n",
128
+ "- Computer\n",
129
+ "- Himachal_Pradesh\n",
130
+ "- Hindu_philosophy\n",
131
+ "- Boston\n",
132
+ "- Cubism\n",
133
+ "- Pope_John_XXIII\n",
134
+ "- Seattle\n",
135
+ "- Alsace\n",
136
+ "- Ashkenazi_Jews\n",
137
+ "- Idealism\n",
138
+ "- Edmund_Burke\n",
139
+ "- Franco-Prussian_War\n",
140
+ "- Pope_Paul_VI\n",
141
+ "- Republic_of_the_Congo\n",
142
+ "- Montevideo\n",
143
+ "- Alexander_Graham_Bell\n",
144
+ "- CBC_Television\n",
145
+ "- MP3\n",
146
+ "- States_of_Germany\n",
147
+ "- Mammal\n",
148
+ "- 51st_state\n",
149
+ "- Hokkien\n",
150
+ "- Digestion\n",
151
+ "- Cyprus\n",
152
+ "- Southampton\n",
153
+ "- Russian_Soviet_Federative_Socialist_Republic\n",
154
+ "- British_Isles\n",
155
+ "- Digimon\n",
156
+ "- Anthropology\n",
157
+ "- Web_browser\n",
158
+ "- Green\n",
159
+ "- Mexico_City\n",
160
+ "- Slavs\n",
161
+ "- Communications_in_Somalia\n",
162
+ "- Insect\n",
163
+ "- Child_labour\n",
164
+ "- Orthodox_Judaism\n",
165
+ "- The_Sun_(United_Kingdom)\n",
166
+ "- Red\n",
167
+ "- Presbyterianism\n",
168
+ "- Elevator\n",
169
+ "- Punjab,_Pakistan\n",
170
+ "- Cardinal_(Catholicism)\n",
171
+ "- 2008_Sichuan_earthquake\n",
172
+ "- Samurai\n",
173
+ "- Association_football\n",
174
+ "- Identity_(social_science)\n",
175
+ "- Aircraft_carrier\n",
176
+ "- Group_(mathematics)\n",
177
+ "- United_States_dollar\n",
178
+ "- Political_philosophy\n",
179
+ "- Airport\n",
180
+ "- General_Electric\n",
181
+ "- Virgil\n",
182
+ "- Flowering_plant\n",
183
+ "- Circadian_rhythm\n",
184
+ "- Nonprofit_organization\n",
185
+ "- Comics\n",
186
+ "- The_Blitz\n",
187
+ "- Marvel_Comics\n",
188
+ "- Gymnastics\n",
189
+ "- United_States_Army\n",
190
+ "- The_Legend_of_Zelda:_Twilight_Princess\n",
191
+ "- Tuvalu\n",
192
+ "- Somalis\n",
193
+ "- Paris\n",
194
+ "- Antibiotics\n",
195
+ "- Neolithic\n",
196
+ "- Napoleon\n",
197
+ "- Treaty\n",
198
+ "- Raleigh,_North_Carolina\n",
199
+ "- Palermo\n",
200
+ "- Central_Intelligence_Agency\n",
201
+ "- Miami\n",
202
+ "- Pub\n",
203
+ "- Southern_Europe\n",
204
+ "- Szlachta\n",
205
+ "- Internet_service_provider\n",
206
+ "- Capacitor\n",
207
+ "- Military_history_of_the_United_States\n",
208
+ "- Spectre_(2015_film)\n",
209
+ "- Biodiversity\n",
210
+ "- Houston\n",
211
+ "- Arena_Football_League\n",
212
+ "- Guinea-Bissau\n",
213
+ "- Mary_(mother_of_Jesus)\n",
214
+ "- Switzerland\n",
215
+ "- Renewable_energy_commercialization\n",
216
+ "- Dutch_Republic\n",
217
+ "- Namibia\n",
218
+ "- Intellectual_property\n",
219
+ "- Videoconferencing\n",
220
+ "- Jehovah%27s_Witnesses\n",
221
+ "- Arsenal_F.C.\n",
222
+ "- London\n",
223
+ "- Atlantic_City,_New_Jersey\n",
224
+ "- Greeks\n",
225
+ "- Gamal_Abdel_Nasser\n",
226
+ "- YouTube\n",
227
+ "- Molotov%E2%80%93Ribbentrop_Pact\n",
228
+ "- St._John%27s,_Newfoundland_and_Labrador\n",
229
+ "- Hellenistic_period\n",
230
+ "- USB\n",
231
+ "- Hydrogen\n",
232
+ "- Late_Middle_Ages\n",
233
+ "- Neoclassical_architecture\n",
234
+ "- Somerset\n",
235
+ "- Westminster_Abbey\n",
236
+ "- Federal_Aviation_Administration\n",
237
+ "- Mali\n",
238
+ "- Uranium\n",
239
+ "- Oklahoma\n",
240
+ "- Chinese_characters\n",
241
+ "- Alfred_North_Whitehead\n",
242
+ "- Multiracial_American\n",
243
+ "- The_Bronx\n",
244
+ "- Protestantism\n",
245
+ "- Nanjing\n",
246
+ "- Royal_assent\n",
247
+ "- Near_East\n",
248
+ "- Adolescence\n",
249
+ "- Separation_of_church_and_state_in_the_United_States\n",
250
+ "- Infection\n",
251
+ "- Data_compression\n",
252
+ "- Glass\n",
253
+ "- BeiDou_Navigation_Satellite_System\n",
254
+ "- IPod\n",
255
+ "- Film_speed\n",
256
+ "- Estonian_language\n",
257
+ "- Universal_Studios\n",
258
+ "- Buckingham_Palace\n",
259
+ "- United_States_presidential_election,_2004\n",
260
+ "- History_of_India\n",
261
+ "- Aspirated_consonant\n",
262
+ "- Economy_of_Greece\n",
263
+ "- Matter\n",
264
+ "- Human_Development_Index\n",
265
+ "- History_of_science\n",
266
+ "- Royal_Institute_of_British_Architects\n",
267
+ "- Hunter-gatherer\n",
268
+ "- Iranian_languages\n",
269
+ "- Thuringia\n",
270
+ "- Financial_crisis_of_2007%E2%80%9308\n",
271
+ "- LaserDisc\n",
272
+ "- Dell\n",
273
+ "- Letter_case\n",
274
+ "- East_Prussia\n",
275
+ "- John_von_Neumann\n",
276
+ "- Crucifixion_of_Jesus\n",
277
+ "- Carnival\n",
278
+ "- Avicenna\n",
279
+ "- Northwestern_University\n",
280
+ "- Royal_Dutch_Shell\n",
281
+ "- Gene\n",
282
+ "- Crimean_War\n",
283
+ "- Pitch_(music)\n",
284
+ "- Materialism\n",
285
+ "- Vacuum\n",
286
+ "- Antarctica\n",
287
+ "- Race_and_ethnicity_in_the_United_States_Census\n",
288
+ "- Kathmandu\n",
289
+ "- Immaculate_Conception\n",
290
+ "- Copyright_infringement\n",
291
+ "- Liberal_Party_of_Australia\n",
292
+ "- Hunting\n",
293
+ "- Translation\n",
294
+ "- Elizabeth_II\n",
295
+ "- Czech_language\n",
296
+ "- Central_African_Republic\n",
297
+ "- Humanism\n",
298
+ "- Geography_of_the_United_States\n",
299
+ "- Rule_of_law\n",
300
+ "- Egypt\n",
301
+ "- Communication\n",
302
+ "- Transistor\n",
303
+ "- United_States_Air_Force\n",
304
+ "- Saint_Helena\n",
305
+ "- Greece\n",
306
+ "- San_Diego\n",
307
+ "- Mandolin\n",
308
+ "- Beyoncé\n",
309
+ "- Ottoman_Empire\n",
310
+ "- Bern\n",
311
+ "- Everton_F.C.\n",
312
+ "- Hanover\n",
313
+ "- Galicia_(Spain)\n",
314
+ "- Great_power\n",
315
+ "- Catalan_language\n",
316
+ "- Madonna_(entertainer)\n",
317
+ "- Brigham_Young_University\n",
318
+ "- PlayStation_3\n",
319
+ "- Madrasa\n",
320
+ "- Great_Plains\n",
321
+ "- Israel\n",
322
+ "- Pacific_War\n",
323
+ "- Cotton\n",
324
+ "- Nigeria\n",
325
+ "- Ann_Arbor,_Michigan\n",
326
+ "- Geological_history_of_Earth\n",
327
+ "- Hard_rock\n",
328
+ "- Umayyad_Caliphate\n",
329
+ "- Oklahoma_City\n",
330
+ "- Myanmar\n",
331
+ "- Norfolk_Island\n",
332
+ "- Florida\n",
333
+ "- Dwight_D._Eisenhower\n",
334
+ "- Swaziland\n",
335
+ "- Annelid\n",
336
+ "- Apollo\n",
337
+ "- American_Idol\n",
338
+ "- Nutrition\n",
339
+ "- Ministry_of_Defence_(United_Kingdom)\n",
340
+ "- Neptune\n",
341
+ "- Hyderabad\n",
342
+ "- Animal\n",
343
+ "- Valencia\n",
344
+ "- Wayback_Machine\n",
345
+ "- Exhibition_game\n",
346
+ "- Light-emitting_diode\n",
347
+ "- Modern_history\n",
348
+ "- University_of_Kansas\n",
349
+ "- Bird\n",
350
+ "- Richard_Feynman\n",
351
+ "- Tennessee\n",
352
+ "- Party_leaders_of_the_United_States_House_of_Representatives\n",
353
+ "- United_Nations_Population_Fund\n",
354
+ "- Estonia\n",
355
+ "- Sexual_orientation\n",
356
+ "- Copper\n",
357
+ "- IBM\n",
358
+ "- Washington_University_in_St._Louis\n",
359
+ "- Imperial_College_London\n",
360
+ "- Empiricism\n",
361
+ "- Separation_of_powers_under_the_United_States_Constitution\n",
362
+ "- Samoa\n",
363
+ "- Liberia\n",
364
+ "- Muammar_Gaddafi\n",
365
+ "- Imamah_(Shia_doctrine)\n",
366
+ "- Jews\n",
367
+ "- List_of_numbered_streets_in_Manhattan\n",
368
+ "- Tibet\n",
369
+ "- Serbo-Croatian\n",
370
+ "- Railway_electrification_system\n",
371
+ "- Pain\n",
372
+ "- Alloy\n",
373
+ "- Lancashire\n",
374
+ "- Law_of_the_United_States\n",
375
+ "- Infrared\n",
376
+ "- Tuberculosis\n",
377
+ "- Tristan_da_Cunha\n",
378
+ "- Buddhism\n",
379
+ "- John,_King_of_England\n",
380
+ "- Premier_League\n",
381
+ "- Frédéric_Chopin\n",
382
+ "- Phonology\n",
383
+ "- Solar_energy\n",
384
+ "- Steven_Spielberg\n",
385
+ "- Department_store\n",
386
+ "- Armenia\n",
387
+ "- Genocide\n",
388
+ "- Heian_period\n",
389
+ "- House_music\n",
390
+ "- Strasbourg\n",
391
+ "- BBC_Television\n",
392
+ "- Incandescent_light_bulb\n",
393
+ "- Muslim_world\n",
394
+ "- Tucson,_Arizona\n",
395
+ "- Nintendo_Entertainment_System\n",
396
+ "- Utrecht\n",
397
+ "- Bird_migration\n",
398
+ "- Arnold_Schwarzenegger\n",
399
+ "- Bacteria\n",
400
+ "- Melbourne\n",
401
+ "- Charleston,_South_Carolina\n",
402
+ "- Printed_circuit_board\n",
403
+ "- Affirmative_action_in_the_United_States\n",
404
+ "- Philadelphia\n",
405
+ "- Old_English\n",
406
+ "- Sino-Tibetan_relations_during_the_Ming_dynasty\n",
407
+ "- Adult_contemporary_music\n",
408
+ "- Saint_Barth%C3%A9lemy\n",
409
+ "- Eritrea\n",
410
+ "- Yale_University\n",
411
+ "- Super_Nintendo_Entertainment_System\n",
412
+ "- Federalism\n",
413
+ "- Rajasthan\n",
414
+ "- The_Times\n",
415
+ "- Political_party\n",
416
+ "- Diarrhea\n",
417
+ "- Wood\n",
418
+ "- Santa_Monica,_California\n",
419
+ "- Zinc\n",
420
+ "- Unicode\n",
421
+ "- On_the_Origin_of_Species\n",
422
+ "- Guam\n",
423
+ "- Black_people\n",
424
+ "- Richmond,_Virginia\n",
425
+ "- 2008_Summer_Olympics_torch_relay\n",
426
+ "- Friedrich_Hayek\n",
427
+ "- Indigenous_peoples_of_the_Americas\n",
428
+ "- Gothic_architecture\n",
429
+ "- Institute_of_technology\n",
430
+ "- Grape\n",
431
+ "- Bermuda\n",
432
+ "- Middle_Ages\n",
433
+ "- Christian\n",
434
+ "- Asthma\n",
435
+ "- Bill_%26_Melinda_Gates_Foundation\n",
436
+ "- Heresy\n",
437
+ "- Chicago_Cubs\n",
438
+ "- Mosaic\n",
439
+ "- FA_Cup\n",
440
+ "- Queen_(band)\n",
441
+ "- FC_Barcelona\n",
442
+ "- Appalachian_Mountains\n",
443
+ "- Dialect\n",
444
+ "- Korean_War\n",
445
+ "- British_Empire\n",
446
+ "- God\n",
447
+ "- Georgian_architecture\n",
448
+ "- A_cappella\n",
449
+ "- Karl_Popper\n",
450
+ "- Seven_Years%27_War\n",
451
+ "- Kanye_West\n",
452
+ "- Predation\n",
453
+ "- Josip_Broz_Tito\n",
454
+ "- Textual_criticism\n",
455
+ "- University\n",
456
+ "- Electric_motor\n",
457
+ "- New_Delhi\n",
458
+ "- England_national_football_team\n",
459
+ "- University_of_Notre_Dame\n",
460
+ "- Sony_Music_Entertainment\n",
461
+ "- Portugal\n",
462
+ "- George_VI\n",
463
+ "- Asphalt\n",
464
+ "- Energy\n",
465
+ "- Immunology\n",
466
+ "- Pesticide\n",
467
+ "- European_Central_Bank\n",
468
+ "- Emotion\n",
469
+ "- Windows_8\n",
470
+ "- Memory\n",
471
+ "- North_Carolina\n",
472
+ "- Freemasonry\n",
473
+ "- Philosophy_of_space_and_time\n",
474
+ "- Police\n",
475
+ "- Canadian_football\n",
476
+ "- Symbiosis\n",
477
+ "- Gramophone_record\n",
478
+ "- Clothing\n",
479
+ "- Poultry\n",
480
+ "- Armenians\n",
481
+ "- Dog\n",
482
+ "- Planck_constant\n",
483
+ "- High-definition_television\n",
484
+ "- Germans\n",
485
+ "- Capital_punishment_in_the_United_States\n",
486
+ "- Professional_wrestling\n",
487
+ "- Political_corruption\n",
488
+ "- Dominican_Order\n",
489
+ "- Endangered_Species_Act\n",
490
+ "- Zhejiang\n",
491
+ "- Canadian_Armed_Forces\n",
492
+ "- Pharmaceutical_industry\n",
493
+ "- Alaska\n",
494
+ "- New_York_City\n",
495
+ "- Macintosh\n",
496
+ "- Botany\n",
497
+ "- Tajikistan\n",
498
+ "- Federal_Bureau_of_Investigation\n",
499
+ "- Cork_(city)\n",
500
+ "- Dissolution_of_the_Soviet_Union\n",
501
+ "- Comcast\n",
502
+ "- Labour_Party_(UK)\n",
503
+ "- New_Haven,_Connecticut\n",
504
+ "- Order_of_the_British_Empire\n",
505
+ "- Dutch_language\n",
506
+ "- Compact_disc\n",
507
+ "- Bras%C3%ADlia\n",
508
+ "- To_Kill_a_Mockingbird\n",
509
+ "- Sichuan\n",
510
+ "- John_Kerry\n",
511
+ "- Computer_security\n",
512
+ "- Sanskrit\n",
513
+ "- Detroit\n",
514
+ "- Athanasius_of_Alexandria\n",
515
+ "- Space_Race\n",
516
+ "- Anti-aircraft_warfare\n",
517
+ "- Baptists\n",
518
+ "- Quran\n",
519
+ "- Architecture\n",
520
+ "- Myocardial_infarction\n",
521
+ "- Eton_College\n",
522
+ "- Mesozoic\n",
523
+ "- Qing_dynasty\n",
524
+ "- Montana\n",
525
+ "- Education\n",
526
+ "- Literature\n",
527
+ "- Comprehensive_school\n",
528
+ "- Plymouth\n",
529
+ "- Glacier\n",
530
+ "- Lighting\n",
531
+ "- Turner_Classic_Movies\n",
532
+ "- Queen_Victoria\n",
533
+ "- Paper\n",
534
+ "- East_India_Company\n",
535
+ "- Spanish_language_in_the_United_States\n",
536
+ "- Han_dynasty\n",
537
+ "- Gregorian_calendar\n",
538
+ "- Supreme_court\n",
539
+ "- Sahara\n",
540
+ "- Culture\n",
541
+ "- Religion_in_ancient_Rome\n",
542
+ "- Chihuahua_(state)\n",
543
+ "- Canon_law\n",
544
+ "- Kievan_Rus%27\n",
545
+ "- National_Archives_and_Records_Administration\n",
546
+ "- Marshall_Islands\n",
547
+ "- Alps\n",
548
+ "- Age_of_Enlightenment\n",
549
+ "- War_on_Terror\n",
550
+ "- Russian_language\n",
551
+ "- Iran\n",
552
+ "- Genome\n",
553
+ "- Antenna_(radio)\n",
554
+ "- Brain\n",
555
+ "- Warsaw_Pact\n"
556
+ ]
557
+ }
558
+ ],
559
+ "source": [
560
+ "import pandas as pd\n",
561
+ "import zipfile\n",
562
+ "\n",
563
+ "def extract_topics():\n",
564
+ " # Load the dataset\n",
565
+ " # Read the JSON member straight from the zip archive rather than passing the zip file itself to pandas.\n",
566
+ " with zipfile.ZipFile('/content/drive/MyDrive/archive (22).zip', 'r') as zip_ref:\n",
567
+ " # Assuming you want to use 'train-v1.1.json', change this if needed.\n",
568
+ " json_file = 'train-v1.1.json'\n",
569
+ " with zip_ref.open(json_file) as f:\n",
570
+ " data = pd.read_json(f)\n",
571
+ " topics = set()\n",
572
+ "\n",
573
+ " # Extract unique topic names from the dataset\n",
574
+ " for item in data['data']:\n",
575
+ " for paragraph in item['paragraphs']:\n",
576
+ " topics.add(item['title']) # Assuming 'title' represents the topic\n",
577
+ "\n",
578
+ " return topics\n",
579
+ "\n",
580
+ "if __name__ == \"__main__\":\n",
581
+ " topics = extract_topics()\n",
582
+ " print(\"Available topics in the dataset:\")\n",
583
+ " for topic in topics:\n",
584
+ " print(f\"- {topic}\")"
585
+ ]
586
+ },
587
+ {
588
+ "cell_type": "code",
589
+ "source": [
590
+ "import pandas as pd\n",
591
+ "import random\n",
592
+ "import zipfile\n",
593
+ "\n",
594
+ "def generate_incorrect_options(data, correct_answer, topic):\n",
595
+ " \"\"\"Generate plausible incorrect options based on the dataset and the topic context.\"\"\"\n",
596
+ " incorrect_options = set()\n",
597
+ "\n",
598
+ " # Collect all possible answers from the dataset that are related to the topic\n",
599
+ " all_answers = []\n",
600
+ " for item in data['data']:\n",
601
+ " for paragraph in item['paragraphs']:\n",
602
+ " for qa in paragraph['qas']:\n",
603
+ " if qa['answers']:\n",
604
+ " for ans in qa['answers']:\n",
605
+ " if topic.lower() in ans['text'].lower(): # Check if the answer is related to the topic\n",
606
+ " all_answers.append(ans['text'])\n",
607
+ "\n",
608
+ " # Remove the correct answer from the list of possible incorrect options\n",
609
+ " all_answers = list(set(all_answers)) # Remove duplicates\n",
610
+ " if correct_answer in all_answers:\n",
611
+ " all_answers.remove(correct_answer)\n",
612
+ "\n",
613
+ " # Randomly select three unique incorrect options\n",
614
+ " while len(incorrect_options) < 3 and all_answers:\n",
615
+ " incorrect_option = random.choice(all_answers)\n",
616
+ " incorrect_options.add(incorrect_option)\n",
617
+ "\n",
618
+ " return list(incorrect_options)\n",
619
+ "\n",
620
+ "def generate_fill_in_the_blank(question, correct_answer):\n",
621
+ " \"\"\"Generate a fill-in-the-blank question with a placeholder.\"\"\"\n",
622
+ " return question.replace(correct_answer, \"______\")\n",
623
+ "\n",
624
+ "def generate_true_false_question(question, correct_answer):\n",
625
+ " \"\"\"Generate a true/false question.\"\"\"\n",
626
+ " return f\"True or False: {question} (Answer: {'True' if correct_answer else 'False'})\"\n",
627
+ "\n",
628
+ "def generate_quiz_from_dataset(topic, num_questions, question_type):\n",
629
+ " # Extract and load the dataset from the zip file\n",
630
+ " with zipfile.ZipFile('/content/drive/MyDrive/archive (22).zip', 'r') as zip_ref:\n",
631
+ " # Use the 'train-v1.1.json' file within the zip archive\n",
632
+ " json_file = 'train-v1.1.json'\n",
633
+ " with zip_ref.open(json_file) as f:\n",
634
+ " data = pd.read_json(f)\n",
635
+ "\n",
636
+ " questions = []\n",
637
+ "\n",
638
+ " # Filter questions based on the topic\n",
639
+ " for item in data['data']:\n",
640
+ " for paragraph in item['paragraphs']:\n",
641
+ " for qa in paragraph['qas']:\n",
642
+ " question = qa['question']\n",
643
+ " answer = qa['answers'][0]['text'] if qa['answers'] else \"No answer available\"\n",
644
+ " if topic.lower() in question.lower():\n",
645
+ " questions.append((question, answer))\n",
646
+ "\n",
647
+ " if not questions:\n",
648
+ " print(\"No questions found for this topic.\")\n",
649
+ " return\n",
650
+ "\n",
651
+ " # Shuffle questions and select the specified number\n",
652
+ " random.shuffle(questions)\n",
653
+ " selected_questions = questions[:num_questions]\n",
654
+ "\n",
655
+ " for index, (question, answer) in enumerate(selected_questions, start=1):\n",
656
+ " if question_type == 'mcq':\n",
657
+ " print(f\"Question {index}: {question}\")\n",
658
+ "\n",
659
+ " # Generate plausible incorrect options based on the dataset and topic context\n",
660
+ " incorrect_options = generate_incorrect_options(data, answer, topic)\n",
661
+ "\n",
662
+ " # Combine correct answer with incorrect options\n",
663
+ " options = [answer] + incorrect_options\n",
664
+ " random.shuffle(options)\n",
665
+ "\n",
666
+ " # Format options as A, B, C, D\n",
667
+ " for i, option in enumerate(options):\n",
668
+ " print(f\"{chr(65 + i)}) {option}\")\n",
669
+ "\n",
670
+ " # Find the correct answer's position\n",
671
+ " correct_index = options.index(answer)\n",
672
+ " print(f\"Answer: {chr(65 + correct_index)}) {answer}\")\n",
673
+ " print(\"=\" * 50)\n",
674
+ "\n",
675
+ " elif question_type == 'fill-in-the-blank':\n",
676
+ " fill_in_question = generate_fill_in_the_blank(question, answer)\n",
677
+ " print(f\"Fill in the blank {index}: {fill_in_question}\")\n",
678
+ " print(f\"Answer: {answer}\")\n",
679
+ " print(\"=\" * 50)\n",
680
+ "\n",
681
+ " elif question_type == 'true/false':\n",
682
+ " true_false_question = generate_true_false_question(question, answer)\n",
683
+ " print(f\"True/False Question {index}: {true_false_question}\")\n",
684
+ " print(\"=\" * 50)\n",
685
+ "\n",
686
+ "if __name__ == \"__main__\":\n",
687
+ " topic = input(\"Enter the topic for the quiz: \")\n",
688
+ " num_questions = int(input(\"Enter the number of questions to generate: \"))\n",
689
+ " question_type = input(\"Enter question type (mcq/fill-in-the-blank/true/false): \").lower()\n",
690
+ " generate_quiz_from_dataset(topic, num_questions, question_type)\n"
691
+ ],
692
+ "metadata": {
693
+ "colab": {
694
+ "base_uri": "https://localhost:8080/"
695
+ },
696
+ "id": "xklEl5W_jqYM",
697
+ "outputId": "35b6b061-037a-4936-aa43-de77cbd6ef77"
698
+ },
699
+ "execution_count": null,
700
+ "outputs": [
701
+ {
702
+ "output_type": "stream",
703
+ "name": "stdout",
704
+ "text": [
705
+ "Enter the topic for the quiz: Database\n",
706
+ "Enter the number of questions to generate: 5\n",
707
+ "Enter question type (mcq/fill-in-the-blank/true/false): mcq\n",
708
+ "Question 1: What was the name of the database product created by IBM?\n",
709
+ "A) SQL/DS, and, later, Database 2 (DB2)\n",
710
+ "B) not be placed in the database\n",
711
+ "C) unauthorized users from viewing or updating the database\n",
712
+ "D) Database Task Group\n",
713
+ "Answer: A) SQL/DS, and, later, Database 2 (DB2)\n",
714
+ "==================================================\n",
715
+ "Question 2: The IANA database works by connecting names to what information about the location?\n",
716
+ "A) database is found corrupted\n",
717
+ "B) managing personal databases\n",
718
+ "C) database related application\n",
719
+ "D) historical and predicted clock shifts\n",
720
+ "Answer: D) historical and predicted clock shifts\n",
721
+ "==================================================\n",
722
+ "Question 3: How are today's database systems run?\n",
723
+ "A) structured document-oriented database\n",
724
+ "B) general-purpose hardware\n",
725
+ "C) database technology\n",
726
+ "D) SQL/DS, and, later, Database 2 (DB2)\n",
727
+ "Answer: B) general-purpose hardware\n",
728
+ "==================================================\n",
729
+ "Question 4: What database language is the most prominent?\n",
730
+ "A) unauthorized users from viewing or updating the database\n",
731
+ "B) SQL\n",
732
+ "C) relational database management system\n",
733
+ "D) database technology\n",
734
+ "Answer: B) SQL\n",
735
+ "==================================================\n",
736
+ "Question 5: What does database access limit?\n",
737
+ "A) by a \"database management system\" (DBMS)\n",
738
+ "B) who (a person or a certain computer program) is allowed to access what information\n",
739
+ "C) database technology\n",
740
+ "D) database is found corrupted\n",
741
+ "Answer: B) who (a person or a certain computer program) is allowed to access what information\n",
742
+ "==================================================\n"
743
+ ]
744
+ }
745
+ ]
746
+ },
747
+ {
748
+ "cell_type": "code",
749
+ "source": [
750
+ "import pandas as pd\n",
751
+ "import seaborn as sns\n",
752
+ "import matplotlib.pyplot as plt\n",
753
+ "import zipfile\n",
754
+ "\n",
755
+ "# Path to the dataset zip archive on the local filesystem (replace with your own path)\n",
756
+ "dataset_url = '/content/drive/MyDrive/archive (22).zip'\n",
757
+ "\n",
758
+ "json_file = 'train-v1.1.json'\n",
759
+ "\n",
760
+ "# Extract the JSON file from the zip archive\n",
761
+ "with zipfile.ZipFile(dataset_url, 'r') as zip_ref:\n",
762
+ " with zip_ref.open(json_file) as f:\n",
763
+ " df = pd.read_json(f)\n",
764
+ "\n",
765
+ "# Preview the dataset\n",
766
+ "print(df.head())\n",
767
+ "\n",
768
+ "\n",
769
+ "numerical_df = df[['version']].copy()\n",
770
+ "if not numerical_df.empty:\n",
771
+ " plt.figure(figsize=(8, 6))\n",
772
+ " sns.heatmap(numerical_df.corr(), annot=True, cmap='coolwarm')\n",
773
+ " plt.title('Heatmap of Quiz Data Correlations (Numerical Columns)')\n",
774
+ " plt.show()\n",
775
+ "else:\n",
776
+ " print(\"No numerical columns found for correlation analysis.\")\n",
777
+ "\n",
778
+ "\n",
779
+ "plt.figure(figsize=(8, 6))\n",
780
+ "question_counts = df['data'].apply(lambda x: len(x['paragraphs'][0]['qas'])).value_counts()\n",
781
+ "question_counts.plot(kind='bar')\n",
782
+ "plt.title('Bar Plot of Question Frequency')\n",
783
+ "plt.xlabel('Number of Questions')\n",
784
+ "plt.ylabel('Frequency')\n",
785
+ "plt.xticks(rotation=0)\n",
786
+ "plt.show()"
787
+ ],
788
+ "metadata": {
789
+ "colab": {
790
+ "base_uri": "https://localhost:8080/",
791
+ "height": 1000
792
+ },
793
+ "id": "x1TC97wS2zJq",
794
+ "outputId": "35056826-a310-4873-d048-c77929416a13"
795
+ },
796
+ "execution_count": 5,
797
+ "outputs": [
798
+ {
799
+ "output_type": "stream",
800
+ "name": "stdout",
801
+ "text": [
802
+ " data version\n",
803
+ "0 {'title': 'University_of_Notre_Dame', 'paragra... 1.1\n",
804
+ "1 {'title': 'Beyoncé', 'paragraphs': [{'context'... 1.1\n",
805
+ "2 {'title': 'Montana', 'paragraphs': [{'context'... 1.1\n",
806
+ "3 {'title': 'Genocide', 'paragraphs': [{'context... 1.1\n",
807
+ "4 {'title': 'Antibiotics', 'paragraphs': [{'cont... 1.1\n"
808
+ ]
809
+ },
810
+ {
811
+ "output_type": "stream",
812
+ "name": "stderr",
813
+ "text": [
814
+ "/usr/local/lib/python3.11/dist-packages/seaborn/matrix.py:202: RuntimeWarning: All-NaN slice encountered\n",
815
+ " vmin = np.nanmin(calc_data)\n",
816
+ "/usr/local/lib/python3.11/dist-packages/seaborn/matrix.py:207: RuntimeWarning: All-NaN slice encountered\n",
817
+ " vmax = np.nanmax(calc_data)\n"
818
+ ]
819
+ },
820
+ {
821
+ "output_type": "display_data",
822
+ "data": {
823
+ "text/plain": [
824
+ "<Figure size 800x600 with 2 Axes>"
825
+ ],
826
+ "image/png": "\n"
827
+ },
828
+ "metadata": {}
829
+ },
830
+ {
831
+ "output_type": "display_data",
832
+ "data": {
833
+ "text/plain": [
834
+ "<Figure size 800x600 with 1 Axes>"
835
+ ],
836
+ "image/png": "\n"
837
+ },
838
+ "metadata": {}
839
+ }
840
+ ]
841
+ }
842
+ ],
843
+ "metadata": {
844
+ "colab": {
845
+ "provenance": []
846
+ },
847
+ "kernelspec": {
848
+ "display_name": "Python 3",
849
+ "name": "python3"
850
+ },
851
+ "language_info": {
852
+ "name": "python"
853
+ }
854
+ },
855
+ "nbformat": 4,
856
+ "nbformat_minor": 0
857
+ }