karouswissem committed
Commit 3291f62 · 1 Parent(s): cc9afbc
Netflix_Recommendation_Notebook_Code ADDED
@@ -0,0 +1,42 @@
+
+ # ran on Kaggle
+ !pip install sentence-transformers
+ !pip install torch
+ import torch
+ from sentence_transformers import SentenceTransformer
+ import numpy as np
+ import pandas as pd
+ from tqdm import tqdm  # for tracking progress across batches
+
+ # check whether a GPU is available
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ print(f"Using device: {device}")
+
+ # load the dataset
+ dataset = pd.read_csv('/kaggle/input/d/infamouscoder/dataset-netflix-shows/netflix_titles.csv')
+
+ # load the model onto the GPU if available
+ model = SentenceTransformer("all-MiniLM-L6-v2").to(device)
+
+ # combine fields (title, genre, description) into one string per row for embedding
+ def combine_description_title_and_genre(description, listed_in, title):
+     return f"{description} Genre: {listed_in} Title: {title}"
+
+ # create the combined text column
+ dataset['combined_text'] = dataset.apply(lambda row: combine_description_title_and_genre(row['description'], row['listed_in'], row['title']), axis=1)
+
+ # generate embeddings in batches to save memory
+ batch_size = 32
+ embeddings = []
+
+ for i in tqdm(range(0, len(dataset), batch_size), desc="Generating Embeddings"):
+     batch_texts = dataset['combined_text'][i:i+batch_size].tolist()
+     batch_embeddings = model.encode(batch_texts, convert_to_tensor=True, device=device)
+     embeddings.extend(batch_embeddings.cpu().numpy())  # move to CPU to free GPU memory
+
+ # convert the list to a numpy array
+ embeddings = np.array(embeddings)
+
+ # save embeddings and metadata
+ np.save("/kaggle/working/netflix_embeddings.npy", embeddings)
+ dataset[['show_id', 'title', 'description', 'listed_in']].to_csv("/kaggle/working/netflix_metadata.csv", index=False)
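A minimal sketch (not part of this commit) for sanity-checking the two artifacts the notebook writes; run it from a directory holding both files, and note the query string is purely illustrative:

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# load the artifacts produced by the notebook above
embeddings = np.load("netflix_embeddings.npy")   # one 384-dim vector per title (all-MiniLM-L6-v2)
metadata = pd.read_csv("netflix_metadata.csv")

# embed an ad-hoc query with the same model and rank titles by cosine similarity
model = SentenceTransformer("all-MiniLM-L6-v2")
query_vec = model.encode(["a gritty crime drama set in a small town"])  # illustrative query

scores = cosine_similarity(query_vec, embeddings)[0]
top = scores.argsort()[-3:][::-1]
print(metadata.iloc[top][["title", "listed_in"]])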
README.md CHANGED
@@ -1,12 +1,13 @@
  ---
- title: The Maze RS
- emoji: 👁
- colorFrom: red
- colorTo: indigo
+ title: Netflix Recommendation
+ emoji: 📈
+ colorFrom: green
+ colorTo: purple
  sdk: gradio
- sdk_version: 5.26.0
+ sdk_version: 5.5.0
  app_file: app.py
  pinned: false
+ short_description: Recommends Netflix Show/Movie based on description and genre
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,97 @@
+ # Gradio interface
+ import gradio as gr
+ import numpy as np
+ import pandas as pd
+ from sklearn.metrics.pairwise import cosine_similarity
+ from sentence_transformers import SentenceTransformer
+ import requests
+ from PIL import Image
+ from transformers import BlipProcessor, BlipForConditionalGeneration
+ sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
+
+ processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+ image_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+
+ def generate_input(input_type, image=None, text=None, response_amount=3):
+     # initialize the combined input
+     combined_input = ""
+
+     # handle image input if chosen
+     if input_type == "Image" and image:
+         inputs = processor(images=image, return_tensors="pt")  # preprocess the image with BlipProcessor
+         out = image_model.generate(**inputs)  # generate a caption with the BLIP model
+         image_caption = processor.decode(out[0], skip_special_tokens=True)  # decode the model output
+         combined_input += image_caption  # add the image caption to the input
+
+     # handle text input if chosen
+     elif input_type == "Text" and text:
+         combined_input += text  # add the text to the input
+
+     # handle combined image and text input if chosen
+     elif input_type == "Both" and image and text:
+         inputs = processor(images=image, return_tensors="pt")
+         out = image_model.generate(**inputs)
+         image_caption = processor.decode(out[0], skip_special_tokens=True)  # same captioning pipeline as above
+         combined_input += image_caption + " and " + text  # combine the image caption and the text
+
+     # fall back if no input was provided
+     if not combined_input:
+         combined_input = "No input provided."
+     if response_amount is None:
+         response_amount = 3
+
+     return vector_search(combined_input, response_amount)  # search the embedded corpus with the input
+
+ # load embeddings and metadata (created with sentence_transformers on Kaggle; see the notebook above)
+ embeddings = np.load("netflix_embeddings.npy")
+ metadata = pd.read_csv("netflix_metadata.csv")
+
+ # vector search function
+ def vector_search(query, top_n=3):
+     query_embedding = sentence_model.encode(query)  # encode the query with Sentence Transformers
+     similarities = cosine_similarity([query_embedding], embeddings)[0]  # cosine similarity against every title
+     if top_n is None:
+         top_n = 3
+     top_indices = similarities.argsort()[-top_n:][::-1]  # indices of the top-n most similar titles
+     results = metadata.iloc[top_indices]  # look up their metadata
+     # join the top-n rows (title, description, genre) into a single output string
+     return "\n\n".join(
+         f"Title: {row['title']} Description: {row['description']} Genre: {row['listed_in']}"
+         for _, row in results.iterrows()
+     )
+
+ def set_response_amount(response_amount):  # store the selected response amount
+     if response_amount is None:
+         return 3
+     return response_amount
+
+ # based on the selected input type, make the appropriate inputs visible
+ def update_inputs(input_type):
+     if input_type == "Image":
+         return gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)
+     elif input_type == "Text":
+         return gr.update(visible=False), gr.update(visible=True), gr.update(visible=True)
+     elif input_type == "Both":
+         return gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
+
+ with gr.Blocks() as demo:
+     gr.Markdown("# Netflix Recommendation System")
+     gr.Markdown("Enter a query to receive Netflix show recommendations based on title, description, and genre.")
+
+     input_type = gr.Radio(["Image", "Text", "Both"], label="Select Input Type", type="value")
+     response_type = gr.Dropdown(choices=[3, 5, 10, 25], type="value", label="Select Response Amount", visible=False)
+     image_input = gr.Image(label="Upload Image", type="pil", visible=False)  # hidden initially
+     text_input = gr.Textbox(label="Enter Text Query", placeholder="Enter a description or query here", visible=False)  # hidden initially
+
+     input_type.change(fn=update_inputs, inputs=input_type, outputs=[image_input, text_input, response_type])
+     # state variable to store the selected response amount; generate_input falls back to 3 while it is unset
+     selected_response_amount = gr.State()
+
+     # capture the response amount as soon as the dropdown changes
+     response_type.change(fn=set_response_amount, inputs=response_type, outputs=selected_response_amount)
+
+     submit_button = gr.Button("Submit")
+     output = gr.Textbox(label="Recommendations")
+
+     submit_button.click(fn=generate_input, inputs=[input_type, image_input, text_input, selected_response_amount], outputs=output)
+ demo.launch()
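Because the corpus embeddings are fixed once loaded, one possible refinement (not part of this commit) is to L2-normalize them up front so each query becomes a single matrix-vector product instead of a full cosine_similarity call; fast_top_n below is a hypothetical helper name:

import numpy as np

embeddings = np.load("netflix_embeddings.npy")
# normalize rows once; cosine similarity then equals a plain dot product
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
unit_embeddings = embeddings / np.clip(norms, 1e-12, None)

def fast_top_n(query_embedding, top_n=3):
    # return the indices of the top_n most similar titles
    q = query_embedding / max(np.linalg.norm(query_embedding), 1e-12)
    scores = unit_embeddings @ q
    return np.argsort(scores)[-top_n:][::-1]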
netflix_embeddings.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:07256cdb32b19dd08130152df55edbeb2da75211ba101fe4305020679bf1e225
+ size 13527680
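The pointer above means the actual array is stored in Git LFS; after fetching it (e.g. with git lfs pull), a short sketch like this can verify the download against the pointer's oid and size:

import hashlib
import os

path = "netflix_embeddings.npy"
assert os.path.getsize(path) == 13527680  # size from the pointer
with open(path, "rb") as f:
    digest = hashlib.sha256(f.read()).hexdigest()
assert digest == "07256cdb32b19dd08130152df55edbeb2da75211ba101fe4305020679bf1e225"  # oid from the pointer
print("LFS object verified")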
netflix_metadata.csv ADDED
The diff for this file is too large to render. See raw diff
requirements.txt ADDED
@@ -0,0 +1,99 @@
+ accelerate==1.1.1
+ aiofiles==23.2.1
+ aiohappyeyeballs==2.4.3
+ aiohttp==3.10.10
+ aiosignal==1.3.1
+ annotated-types==0.7.0
+ anyio==4.6.2.post1
+ attrs==24.2.0
+ audioread==3.0.1
+ certifi==2024.8.30
+ cffi==1.17.1
+ charset-normalizer==3.4.0
+ click==8.1.7
+ contourpy==1.3.0
+ cycler==0.12.1
+ datasets==3.1.0
+ decorator==5.1.1
+ dill==0.3.8
+ fastapi==0.115.4
+ ffmpy==0.4.0
+ filelock==3.16.1
+ fonttools==4.54.1
+ frozenlist==1.5.0
+ fsspec==2024.9.0
+ gradio==5.5.0
+ gradio_client==1.4.2
+ h11==0.14.0
+ httpcore==1.0.6
+ httpx==0.27.2
+ huggingface-hub==0.26.2
+ idna==3.10
+ Jinja2==3.1.4
+ joblib==1.4.2
+ kiwisolver==1.4.7
+ lazy_loader==0.4
+ librosa==0.10.2.post1
+ llvmlite==0.43.0
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ matplotlib==3.9.2
+ mdurl==0.1.2
+ mpmath==1.3.0
+ msgpack==1.1.0
+ multidict==6.1.0
+ multiprocess==0.70.16
+ networkx==3.4.2
+ numba==0.60.0
+ numpy==2.0.2
+ orjson==3.10.11
+ packaging==24.1
+ pandas==2.2.3
+ pillow==11.0.0
+ platformdirs==4.3.6
+ pooch==1.8.2
+ propcache==0.2.0
+ psutil==6.1.0
+ pyarrow==18.0.0
+ pycparser==2.22
+ pydantic==2.9.2
+ pydantic_core==2.23.4
+ pydub==0.25.1
+ Pygments==2.18.0
+ pyparsing==3.2.0
+ python-dateutil==2.9.0.post0
+ python-multipart==0.0.12
+ pytz==2024.2
+ PyYAML==6.0.2
+ regex==2024.11.6
+ requests==2.32.3
+ rich==13.9.4
+ ruff==0.7.2
+ safehttpx==0.1.1
+ safetensors==0.4.5
+ scikit-learn==1.5.2
+ scipy==1.14.1
+ semantic-version==2.10.0
+ sentence_transformers
+ shellingham==1.5.4
+ six==1.16.0
+ sniffio==1.3.1
+ soundfile==0.12.1
+ soxr==0.5.0.post1
+ starlette==0.41.2
+ sympy==1.13.1
+ threadpoolctl==3.5.0
+ tokenizers==0.20.3
+ tomlkit==0.12.0
+ torch==2.5.1
+ tqdm==4.67.0
+ transformers==4.46.2
+ typer==0.12.5
+ typing_extensions==4.12.2
+ tzdata==2024.2
+ urllib3==2.2.3
+ uvicorn==0.32.0
+ websockets==12.0
+ xxhash==3.5.0
+ yarl==1.17.1
+