"""Streamlit app for YouTube trend prediction.

The page has three tabs:

* **Predict** - collect a title, duration and category (optionally pre-filled
  from a video URL via ``get_metadata``) and run the saved model on them.
* **Trending** - list the videos returned by ``get_trending_videos`` for a
  selected country code and show details for one of them.
* **Visualize** - draw four charts from pre-computed CSV files.
"""
import base64

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import streamlit as st

from apiSearch import get_metadata, get_trending_videos
from preprocessText import preprocess

# Load the model
# NOTE(review): joblib files are pickle-based; only load model files you trust.
model = joblib.load('85pct(new).pkl')

# Define the categories (display name -> category id)
categories = {
    'Film & Animation': 1,
    'Autos & Vehicles': 2,
    'Music': 10,
    'Pets & Animals': 15,
    'Sports': 17,
    'Short Movies': 18,
    'Travel & Events': 19,
    'Gaming': 20,
    'Videoblogging': 21,
    'People & Blogs': 22,
    'Comedy': 23,
    'Entertainment': 24,
    'News & Politics': 25,
    'Howto & Style': 26,
    'Education': 27,
    'Science & Technology': 28,
    'Nonprofits & Activism': 29,
}

# Reverse lookup (category id -> display name), built once instead of scanning
# ``categories`` on every lookup.
_CATEGORY_NAMES_BY_ID = {cat_id: name for name, cat_id in categories.items()}

# Seconds to wait for a thumbnail download before giving up.
_THUMBNAIL_TIMEOUT = 10


def _category_name(category_id):
    """Return the display name for *category_id*, or 'Unknown Category'."""
    return _CATEGORY_NAMES_BY_ID.get(category_id, 'Unknown Category')


# Create the Streamlit web application
def main():
    """Render the page header and the Predict / Trending / Visualize tabs."""
    st.set_page_config(layout="wide")

    # NOTE(review): these three calls pass unsafe_allow_html=True, but this
    # copy of the file contains no markup in them - presumably inline CSS/HTML
    # was stripped at some point. Restore the original markup if available.
    st.markdown(""" """, unsafe_allow_html=True)
    st.markdown("\n\n", unsafe_allow_html=True)
    st.markdown("\n\nYouTube Trend Prediction\n\n", unsafe_allow_html=True)
    # https://www.freepnglogos.com/uploads/youtube-play-red-logo-png-transparent-background-6.png

    tab1, tab2, tab3 = st.tabs(["Predict", "Trending", "Visualize"])

    with tab1:
        _render_predict_tab()
    with tab2:
        _render_trending_tab()
    with tab3:
        _render_visualize_tab()


def _render_predict_tab():
    """Draw the input widgets and, on request, show the model's prediction."""
    with st.container():
        col1, col2, col3 = st.columns(3)
        default_title, default_duration, default_category = "", 0.0, 1

        with col1:
            url = st.text_input("URL", placeholder="Enter a video url")
            if url:
                metadata = get_metadata(url)
                # Guard against a failed lookup as well as an empty result.
                if metadata is not None and not metadata.empty:
                    default_title = metadata['title'].iloc[0]
                    # number_input is given a float min_value, so the default
                    # must be a float too.
                    default_duration = float(metadata['duration'].iloc[0])
                    default_category = int(metadata['category_id'].iloc[0])
                    thumbnail_url = metadata['thumbnail_link'].iloc[0]
                    thumbnail = get_picture_from_url(thumbnail_url)
                    if thumbnail:
                        st.image(thumbnail, caption='Thumbnail captured',
                                 width=400, channels="BGR")

        with col2:
            title = st.text_input("Title", placeholder="Enter a video title",
                                  value=default_title)
            duration = st.number_input("Duration (in seconds)", min_value=0.0,
                                       value=default_duration)
            category_ids = list(categories.values())
            # A fetched video may carry a category id that is not in the
            # table; fall back to the first entry instead of raising.
            if default_category in category_ids:
                default_index = category_ids.index(default_category)
            else:
                default_index = 0
            category = st.selectbox("Category", list(categories.keys()),
                                    index=default_index)

        with col3:
            uploaded = st.file_uploader("Upload Picture",
                                        type=["jpg", "jpeg", "png"])
            if uploaded is not None:
                st.image(uploaded, caption='Thumbnail Uploaded',
                         width=400, channels="BGR")

    # Convert category to category ID
    categoryId = categories[category]

    if st.button("Predict"):
        title_missing = title is None or title.strip() == ""
        duration_missing = duration == 0

        # Only predict when both inputs are present.
        if title_missing and duration_missing:
            st.warning("Please enter a title and duration.")
        elif title_missing:
            st.warning("Please enter a title")
        elif duration_missing:
            st.warning("Please enter a duration.")
        else:
            prediction = predict_trend(title, duration, categoryId)
            if prediction[0] == 1:
                st.success("This video is predicted to be a trend!")
                st.markdown("![Alt Text](https://media.tenor.com/Cyi2zT7wcmcAAAAj/pentol-gif-eak.gif)")
            else:
                st.info("This video is predicted not to be a trend.")
                st.markdown("![Alt Text](https://media.tenor.com/VYKtkKnHaUcAAAAj/quby-cute.gif)")


def _render_trending_tab():
    """List trending videos for a country and show details for one of them."""
    country_code = st.selectbox(
        "Select Country Code",
        ['US', 'CA', 'GB', 'DE', 'FR', 'RU', 'BR', 'IN', 'MY', 'SG', 'JP', 'KR'],
    )
    with st.container():
        st.write("Top 10 Trending Video")
        df = get_trending_videos(country_code)

        # Check the result before touching it; an empty frame has no row to
        # select either.
        if df is None or df.empty:
            st.error('Failed to retrieve trending videos.')
            return

        st.dataframe(df)

        # Display video titles
        selected_video_title = st.selectbox("Select a Video", df['title'])
        selected_video = df[df['title'] == selected_video_title].iloc[0]

        col4, col5 = st.columns(2)
        with col4:
            image = get_picture_from_url(selected_video['thumbnail_link'])
            if image:
                st.image(image, caption='Thumbnail captured',
                         width=400, channels="BGR")
        with col5:
            st.write("Title:", selected_video['title'])
            st.write("Category:", _category_name(selected_video['category_id']))
            st.write("Duration:", selected_video['duration'])


def _render_visualize_tab():
    """Lay the four charts out in a 2x2 grid."""
    with st.container():
        col6, col7 = st.columns(2)
        with col6:
            show_top_category()
        with col7:
            show_top_duration()
    with st.container():
        col8, col9 = st.columns(2)
        with col8:
            show_top_title()
        with col9:
            show_top_titleLength()


def get_picture_from_url(url):
    """Download *url* and return the response body as bytes.

    Returns ``None`` when *url* is empty or when the request fails for any
    reason (invalid URL, connection error, timeout, HTTP error status).
    """
    if not url:
        return None
    try:
        response = requests.get(url, timeout=_THUMBNAIL_TIMEOUT)
        # Treat 4xx/5xx as failures so an error page is never handed to
        # st.image as if it were image data.
        response.raise_for_status()
    except requests.RequestException:
        return None
    return response.content


def show_top_category():
    """Bar chart of predicted probability per category, read from topCategory.csv."""
    # NOTE(review): absolute, machine-specific path; the app only runs where
    # this directory exists.
    topCategory = pd.read_csv(r'C:\Users\LEGION\Desktop\MMU\Data Science Fundamental\Project\Prediction of Video\topCategory.csv')

    # Sort the DataFrame in ascending order based on predicted_prob column
    topCategory_sorted = topCategory.sort_values('predicted_prob')

    # Add a 'rank' column representing the ascending order of predicted_prob
    topCategory_sorted['rank'] = range(1, len(topCategory_sorted) + 1)

    # Map category_id to category name using the categories dictionary
    topCategory_sorted['category_name'] = (
        topCategory_sorted['category_id'].map(_category_name)
    )

    # One colour per plotted hue level (the category name).
    color_palette = sns.color_palette(
        'Set2', topCategory_sorted['category_name'].nunique()
    )

    # Create a bar plot based on rank and predicted_prob columns with
    # different colors for each category_name
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.barplot(data=topCategory_sorted, x='rank', y='predicted_prob',
                hue='category_name', palette=color_palette, ax=ax)
    ax.set_xlabel('Rank')
    ax.set_ylabel('Predicted Probability')
    ax.set_title('Top Categories')

    # Display the legend and the plot in Streamlit
    st.pyplot(fig)
    # The script reruns on every interaction; close the figure so they do not
    # pile up in memory.
    plt.close(fig)


def show_top_duration():
    """Scatter plot of duration vs. predicted probability, read from topDuration.csv."""
    topDuration = pd.read_csv(r'C:\Users\LEGION\Desktop\MMU\Data Science Fundamental\Project\Prediction of Video\topDuration.csv')
    topDuration_sorted = topDuration.sort_values('predicted_prob', ascending=False)

    # Set the duration as the x-axis and predicted_prob as the y-axis
    x = topDuration_sorted['duration']
    y = topDuration_sorted['predicted_prob']

    # No palette here: seaborn ignores palette when no hue is given.
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.scatterplot(x=x, y=y, ax=ax)
    ax.set_xlabel('Duration')
    ax.set_ylabel('Predicted Probability')
    ax.set_title('Top Durations')

    # Display the plot in Streamlit
    st.pyplot(fig)
    plt.close(fig)


def show_top_title():
    """Horizontal bar chart of title-feature importance, read from topTitle.csv."""
    topTitle = pd.read_csv(r'C:\Users\LEGION\Desktop\MMU\Data Science Fundamental\Project\Prediction of Video\topTitle.csv')

    # Sort the DataFrame in ascending order based on the Importance Score column
    topTitle_sorted = topTitle.sort_values('Importance Score')

    sns.set(style="whitegrid")
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.barplot(x='Importance Score', y='Feature', data=topTitle_sorted,
                palette="rocket", ax=ax)
    ax.set_xlabel('Importance Score', fontsize=12)
    ax.set_ylabel('Feature', fontsize=12)
    ax.set_title('Top Title Features', fontsize=14)
    fig.tight_layout()

    st.pyplot(fig)
    plt.close(fig)


def round_interval(interval_str):
    """Rewrite an interval string such as ``"(12.5, 20.9]"`` as ``"(12, 20)"``.

    Surrounding brackets/parentheses are stripped, both bounds are parsed as
    floats and truncated with ``int``; the result always uses parentheses.
    """
    start, end = map(float, interval_str.strip('()[]').split(','))
    return f"({int(start)}, {int(end)})"


def show_top_titleLength():
    """Bar chart of predicted probability per title-length range, read from topTitleLength.csv."""
    topTitleLength = pd.read_csv(r'C:\Users\LEGION\Desktop\MMU\Data Science Fundamental\Project\Prediction of Video\topTitleLength.csv')
    title_length_ranges = topTitleLength['titleLength']
    predicted_probs = topTitleLength['predicted_prob']
    rounded_ranges = [round_interval(range_val) for range_val in title_length_ranges]

    # Set the style of the plot
    sns.set(style='whitegrid')

    # Plot the graph using Seaborn
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=rounded_ranges, y=predicted_probs, ax=ax)
    ax.set_xlabel('Title Length Range')
    ax.set_ylabel('Predicted Probability')
    ax.set_title('Top 5 Ranges for Title Length vs. Predicted Probability')
    ax.tick_params(axis='x', rotation=45)

    # Rendering goes through Streamlit, so no plt.show() is needed.
    st.pyplot(fig)
    plt.close(fig)


# Function to make predictions
def predict_trend(title, duration, category_id):
    """Run the loaded model on a single video and return its prediction.

    Args:
        title: Raw video title; it is passed through ``preprocess`` and the
            resulting tokens are joined with spaces.
        duration: Video duration; converted to ``float``.
        category_id: Category id; converted to ``int``.

    Returns:
        Whatever ``model.predict`` returns for the one-row input frame; the
        caller reads element ``[0]`` and treats ``1`` as "will trend".
    """
    clean_new_title = preprocess(title)
    # Join the preprocessed words back into a string
    clean_new_title_str = ' '.join(clean_new_title)

    # Prepare the input data
    data = pd.DataFrame({
        'cleanTitle': [clean_new_title_str],
        'titleLength': [len(title)],
        'categoryId': [int(category_id)],
        'duration': [float(duration)],
    })
    data['categoryId'] = data['categoryId'].astype('category')
    data['duration'] = data['duration'].astype('float64')

    # Make the prediction
    return model.predict(data)


if __name__ == "__main__":
    main()