Abid Ali Awan committed
Commit 355b607 · 1 Parent(s): 6d2ef5c

first commit

Files changed (4)
  1. README.md +1 -1
  2. main.py +510 -0
  3. notebook.ipynb +444 -0
  4. requirements.txt +6 -0
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
 title: Llama 4 RAG
-emoji: 🐢
+emoji: 🦙
 colorFrom: yellow
 colorTo: pink
 sdk: gradio
main.py ADDED
@@ -0,0 +1,510 @@
+import os
+import zipfile
+from typing import Dict, List, Optional, Union
+
+import gradio as gr
+from groq import Groq
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import PromptTemplate
+from langchain_core.runnables import RunnablePassthrough
+from langchain_groq import ChatGroq
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_core.vectorstores import InMemoryVectorStore
+# Retrieve API key for Groq from the environment variables
+GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
+
+# Initialize the Groq client
+client = Groq(api_key=GROQ_API_KEY)
+
+# Initialize the LLM
+llm = ChatGroq(model="meta-llama/llama-4-scout-17b-16e-instruct", api_key=GROQ_API_KEY)
+
+# Initialize the embedding model
+embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
+
+# General constants for the UI
+TITLE = """<h1 align="center">✨ Llama 4 RAG Application</h1>"""
+AVATAR_IMAGES = (
+    None,
+    "https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo.png",
+)
+
+# List of supported text extensions (alphabetically sorted)
+TEXT_EXTENSIONS = [
+    ".bat",
+    ".c",
+    ".cfg",
+    ".conf",
+    ".cpp",
+    ".cs",
+    ".css",
+    ".docx",
+    ".go",
+    ".h",
+    ".html",
+    ".ini",
+    ".java",
+    ".js",
+    ".json",
+    ".jsx",
+    ".md",
+    ".php",
+    ".ps1",
+    ".py",
+    ".rb",
+    ".rs",
+    ".sh",
+    ".toml",
+    ".ts",
+    ".tsx",
+    ".txt",
+    ".xml",
+    ".yaml",
+    ".yml",
+]
+
+# Global variables
+EXTRACTED_FILES = {}
+VECTORSTORE = None
+RAG_CHAIN = None
+
+# Initialize the text splitter
+text_splitter = RecursiveCharacterTextSplitter(
+    chunk_size=1000, chunk_overlap=100, separators=["\n\n", "\n"]
+)
+
+# Define the RAG prompt template
+template = """You are an expert assistant tasked with answering questions based on the provided documents.
+Use only the given context to generate your answer.
+If the answer cannot be found in the context, clearly state that you do not know.
+Be detailed and precise in your response, but avoid mentioning or referencing the context itself.
+
+Context:
+{context}
+
+Question:
+{question}
+
+Answer:"""
+
+# Create the PromptTemplate
+rag_prompt = PromptTemplate.from_template(template)
+
+
+def extract_text_from_zip(zip_file_path: str) -> Dict[str, str]:
+    """
+    Extract text content from files in a ZIP archive.
+
+    Parameters:
+        zip_file_path (str): Path to the ZIP file.
+
+    Returns:
+        Dict[str, str]: Dictionary mapping filenames to their text content.
+    """
+    text_contents = {}
+
+    with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
+        for file_info in zip_ref.infolist():
+            # Skip directories
+            if file_info.filename.endswith("/"):
+                continue
+
+            # Skip binary files and focus on text files
+            file_ext = os.path.splitext(file_info.filename)[1].lower()
+
+            if file_ext in TEXT_EXTENSIONS:
+                try:
+                    with zip_ref.open(file_info) as file:
+                        content = file.read().decode("utf-8", errors="replace")
+                        text_contents[file_info.filename] = content
+                except Exception as e:
+                    text_contents[file_info.filename] = (
+                        f"Error extracting file: {str(e)}"
+                    )
+
+    return text_contents
+
+
+def extract_text_from_single_file(file_path: str) -> Dict[str, str]:
+    """
+    Extract text content from a single file.
+
+    Parameters:
+        file_path (str): Path to the file.
+
+    Returns:
+        Dict[str, str]: Dictionary mapping filename to its text content.
+    """
+    text_contents = {}
+    filename = os.path.basename(file_path)
+    file_ext = os.path.splitext(filename)[1].lower()
+
+    if file_ext in TEXT_EXTENSIONS:
+        try:
+            with open(file_path, "r", encoding="utf-8", errors="replace") as file:
+                content = file.read()
+                text_contents[filename] = content
+        except Exception as e:
+            text_contents[filename] = f"Error reading file: {str(e)}"
+
+    return text_contents
+
+
+def upload_files(
+    files: Optional[List[str]], chatbot: List[Union[dict, gr.ChatMessage]]
+):
+    """
+    Process uploaded files (ZIP or single text files): extract text content and append a message to the chat.
+
+    Parameters:
+        files (Optional[List[str]]): List of file paths.
+        chatbot (List[Union[dict, gr.ChatMessage]]): The conversation history.
+
+    Returns:
+        List[Union[dict, gr.ChatMessage]]: Updated conversation history.
+    """
+    global EXTRACTED_FILES, VECTORSTORE, RAG_CHAIN
+
+    # Handle multiple file uploads
+    if len(files) > 1:
+        total_files_processed = 0
+        total_files_extracted = 0
+        file_types = set()
+
+        # Process each file
+        for file in files:
+            filename = os.path.basename(file)
+            file_ext = os.path.splitext(filename)[1].lower()
+
+            # Process based on file type
+            if file_ext == ".zip":
+                extracted_files = extract_text_from_zip(file)
+                file_types.add("zip")
+            else:
+                extracted_files = extract_text_from_single_file(file)
+                file_types.add("text")
+
+            if extracted_files:
+                total_files_extracted += len(extracted_files)
+                # Store the extracted content in the global variable
+                EXTRACTED_FILES[filename] = extracted_files
+
+            total_files_processed += 1
+
+        # Create a summary message for multiple files
+        file_types_str = (
+            "files"
+            if len(file_types) > 1
+            else ("ZIP files" if "zip" in file_types else "text files")
+        )
+
+        # Create a list of uploaded file names
+        file_list = "\n".join([f"- {os.path.basename(file)}" for file in files])
+
+        chatbot.append(
+            gr.ChatMessage(
+                role="user",
+                content=f"<p>📚 Multiple {file_types_str} uploaded ({total_files_processed} files)</p><p>Extracted {total_files_extracted} text file(s) in total</p><p>Uploaded files:</p><pre>{file_list}</pre>",
+            )
+        )
+
+    # Handle single file upload
+    elif len(files) == 1:
+        file = files[0]
+        filename = os.path.basename(file)
+        file_ext = os.path.splitext(filename)[1].lower()
+
+        # Process based on file type
+        if file_ext == ".zip":
+            extracted_files = extract_text_from_zip(file)
+            file_type_msg = "📦 ZIP file"
+        else:
+            extracted_files = extract_text_from_single_file(file)
+            file_type_msg = "📄 File"
+
+        if not extracted_files:
+            chatbot.append(
+                gr.ChatMessage(
+                    role="user",
+                    content=f"<p>{file_type_msg} uploaded: {filename}, but no text content was found or the file format is not supported.</p>",
+                )
+            )
+        else:
+            file_list = "\n".join([f"- {name}" for name in extracted_files.keys()])
+            chatbot.append(
+                gr.ChatMessage(
+                    role="user",
+                    content=f"<p>{file_type_msg} uploaded: {filename}</p><p>Extracted {len(extracted_files)} text file(s):</p><pre>{file_list}</pre>",
+                )
+            )
+
+            # Store the extracted content in the global variable
+            EXTRACTED_FILES[filename] = extracted_files
+
+    # Process the extracted files and create vector embeddings
+    if EXTRACTED_FILES:
+        # Prepare documents for processing
+        all_texts = []
+        for filename, files in EXTRACTED_FILES.items():
+            for file_path, content in files.items():
+                all_texts.append(
+                    {"page_content": content, "metadata": {"source": file_path}}
+                )
+
+        # Create document objects
+        from langchain_core.documents import Document
+
+        documents = [
+            Document(page_content=item["page_content"], metadata=item["metadata"])
+            for item in all_texts
+        ]
+
+        # Split the documents into chunks
+        chunks = text_splitter.split_documents(documents)
+
+        # Create the vector store
+        VECTORSTORE = InMemoryVectorStore.from_documents(
+            documents=chunks,
+            embedding=embed_model,
+        )
+
+        # Create the retriever
+        retriever = VECTORSTORE.as_retriever()
+
+        # Create the RAG chain
+        RAG_CHAIN = (
+            {"context": retriever, "question": RunnablePassthrough()}
+            | rag_prompt
+            | llm
+            | StrOutputParser()
+        )
+
+        # Add a confirmation message
+        chatbot.append(
+            gr.ChatMessage(
+                role="assistant",
+                content="Documents processed and indexed. You can now ask questions about the content.",
+            )
+        )
+
+    return chatbot
+
+
+def user(text_prompt: str, chatbot: List[gr.ChatMessage]):
+    """
+    Append a new user text message to the chat history.
+
+    Parameters:
+        text_prompt (str): The input text provided by the user.
+        chatbot (List[gr.ChatMessage]): The existing conversation history.
+
+    Returns:
+        Tuple[str, List[gr.ChatMessage]]: A tuple of an empty string (clearing the prompt)
+        and the updated conversation history.
+    """
+    if text_prompt:
+        chatbot.append(gr.ChatMessage(role="user", content=text_prompt))
+    return "", chatbot
+
+
+def get_message_content(msg):
+    """
+    Retrieve the content of a message that can be either a dictionary or a gr.ChatMessage.
+
+    Parameters:
+        msg (Union[dict, gr.ChatMessage]): The message object.
+
+    Returns:
+        str: The textual content of the message.
+    """
+    if isinstance(msg, dict):
+        return msg.get("content", "")
+    return msg.content
+
+
+def process_query(chatbot: List[Union[dict, gr.ChatMessage]]):
+    """
+    Process the user's query using the RAG pipeline.
+
+    Parameters:
+        chatbot (List[Union[dict, gr.ChatMessage]]): The conversation history.
+
+    Returns:
+        List[Union[dict, gr.ChatMessage]]: The updated conversation history with the response.
+    """
+    global RAG_CHAIN
+
+    if len(chatbot) == 0:
+        chatbot.append(
+            gr.ChatMessage(
+                role="assistant",
+                content="Please enter a question or upload documents to start the conversation.",
+            )
+        )
+        return chatbot
+
+    # Get the last user message as the prompt
+    user_messages = [
+        msg
+        for msg in chatbot
+        if (isinstance(msg, dict) and msg.get("role") == "user")
+        or (hasattr(msg, "role") and msg.role == "user")
+    ]
+
+    if not user_messages:
+        chatbot.append(
+            gr.ChatMessage(
+                role="assistant",
+                content="Please enter a question to start the conversation.",
+            )
+        )
+        return chatbot
+
+    last_user_msg = user_messages[-1]
+    prompt = get_message_content(last_user_msg)
+
+    # Skip if the last message was about uploading a file
+    if (
+        "📦 ZIP file uploaded:" in prompt
+        or "📄 File uploaded:" in prompt
+        or "📚 Multiple files uploaded" in prompt
+    ):
+        return chatbot
+
+    # Check if RAG chain is available
+    if RAG_CHAIN is None:
+        chatbot.append(
+            gr.ChatMessage(
+                role="assistant",
+                content="Please upload documents first to enable question answering.",
+            )
+        )
+        return chatbot
+
+    # Append a placeholder for the assistant's response
+    chatbot.append(gr.ChatMessage(role="assistant", content="Thinking..."))
+
+    try:
+        # Process the query through the RAG chain
+        response = RAG_CHAIN.invoke(prompt)
+
+        # Update the placeholder with the actual response
+        chatbot[-1].content = response
+    except Exception as e:
+        # Handle any errors
+        chatbot[-1].content = f"Error processing your query: {str(e)}"
+
+    return chatbot
+
+
+def reset_app(chatbot):
+    """
+    Reset the app by clearing the chat context and removing any uploaded files.
+
+    Parameters:
+        chatbot (List[Union[dict, gr.ChatMessage]]): The conversation history.
+
+    Returns:
+        List[Union[dict, gr.ChatMessage]]: A fresh conversation history.
+    """
+    global EXTRACTED_FILES, VECTORSTORE, RAG_CHAIN
+
+    # Clear the global variables
+    EXTRACTED_FILES = {}
+    VECTORSTORE = None
+    RAG_CHAIN = None
+
+    # Reset the chatbot with a welcome message
+    return [
+        gr.ChatMessage(
+            role="assistant",
+            content="App has been reset. You can start a new conversation or upload new documents.",
+        )
+    ]
+
+
+# Define the Gradio UI components
+chatbot_component = gr.Chatbot(
+    label="Llama 4 RAG",
+    type="messages",
+    bubble_full_width=False,
+    avatar_images=AVATAR_IMAGES,
+    scale=2,
+    height=350,
+)
+text_prompt_component = gr.Textbox(
+    placeholder="Ask a question about your documents...",
+    show_label=False,
+    autofocus=True,
+    scale=28,
+)
+upload_files_button_component = gr.UploadButton(
+    label="Upload",
+    file_count="multiple",
+    file_types=[".zip", ".docx"] + TEXT_EXTENSIONS,
+    scale=1,
+    min_width=80,
+)
+send_button_component = gr.Button(
+    value="Send", variant="primary", scale=1, min_width=80
+)
+reset_button_component = gr.Button(value="Reset", variant="stop", scale=1, min_width=80)
+
+# Define input lists for button chaining
+user_inputs = [text_prompt_component, chatbot_component]
+
+with gr.Blocks(theme=gr.themes.Ocean()) as demo:
+    gr.HTML(TITLE)
+    with gr.Column():
+        chatbot_component.render()
+        with gr.Row(equal_height=True):
+            text_prompt_component.render()
+            send_button_component.render()
+            upload_files_button_component.render()
+            reset_button_component.render()
+
+    # When the Send button is clicked, first process the user text then process the query
+    send_button_component.click(
+        fn=user,
+        inputs=user_inputs,
+        outputs=[text_prompt_component, chatbot_component],
+        queue=False,
+    ).then(
+        fn=process_query,
+        inputs=[chatbot_component],
+        outputs=[chatbot_component],
+        api_name="process_query",
+    )
+
+    # Allow submission using the Enter key
+    text_prompt_component.submit(
+        fn=user,
+        inputs=user_inputs,
+        outputs=[text_prompt_component, chatbot_component],
+        queue=False,
+    ).then(
+        fn=process_query,
+        inputs=[chatbot_component],
+        outputs=[chatbot_component],
+        api_name="process_query_submit",
+    )
+
+    # Handle file uploads
+    upload_files_button_component.upload(
+        fn=upload_files,
+        inputs=[upload_files_button_component, chatbot_component],
+        outputs=[chatbot_component],
+        queue=False,
+    )
+
+    # Handle Reset button clicks
+    reset_button_component.click(
+        fn=reset_app,
+        inputs=[chatbot_component],
+        outputs=[chatbot_component],
+        queue=False,
+    )
+
+# Launch the demo interface
+demo.queue().launch()
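
For quick headless testing of the same retrieval pipeline outside the Gradio UI, a minimal sketch follows. It assumes GROQ_API_KEY is set in the environment and uses a hypothetical local file sample.txt and an illustrative query; the prompt here is shortened for brevity and is not the exact template used in main.py.

import os

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings

llm = ChatGroq(
    model="meta-llama/llama-4-scout-17b-16e-instruct",
    api_key=os.environ.get("GROQ_API_KEY"),  # assumes the key is set
)
embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")

# Load one text file into a Document and chunk it, mirroring upload_files() in main.py.
with open("sample.txt", encoding="utf-8") as f:  # hypothetical input file
    docs = [Document(page_content=f.read(), metadata={"source": "sample.txt"})]
chunks = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=100, separators=["\n\n", "\n"]
).split_documents(docs)

# Build the same retriever -> prompt -> LLM -> parser chain as main.py.
retriever = InMemoryVectorStore.from_documents(chunks, embedding=embed_model).as_retriever()
prompt = PromptTemplate.from_template(
    "Answer using only this context:\n{context}\n\nQuestion:\n{question}\n\nAnswer:"
)
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)
print(chain.invoke("What is this document about?"))  # illustrative query
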
notebook.ipynb ADDED
@@ -0,0 +1,444 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "7bfc3afd-0868-4938-9b45-19b2cba1a149",
+   "metadata": {},
+   "source": [
+    "## Setting Up"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d102db92-a346-447d-8c61-3be8292adec7",
+   "metadata": {
+    "executionCancelledAt": null,
+    "executionTime": 22644,
+    "lastExecutedAt": 1744298182897,
+    "lastExecutedByKernel": "b0791251-777d-414c-b5cc-636b4f317d9a",
+    "lastScheduledRunId": null,
+    "lastSuccessfullyExecutedCode": "%%capture\n%pip install langchain\n%pip install langchain-community \n%pip install langchainhub \n%pip install langchain-chroma \n%pip install langchain-groq\n%pip install langchain-huggingface\n%pip install unstructured[docx]"
+   },
+   "outputs": [],
+   "source": [
+    "%%capture\n",
+    "%pip install langchain\n",
+    "%pip install langchain-community \n",
+    "%pip install langchainhub \n",
+    "%pip install langchain-chroma \n",
+    "%pip install langchain-groq\n",
+    "%pip install langchain-huggingface\n",
+    "%pip install unstructured[docx]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "356e4c03-5642-4d21-8ee4-bc32b14e98ec",
+   "metadata": {},
+   "source": [
+    "## Groq Python API"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "4e8b89b9-f5bc-466b-a30f-db8e97828826",
+   "metadata": {
+    "executionCancelledAt": null,
+    "executionTime": 3176,
+    "lastExecutedAt": 1744298186074,
+    "lastExecutedByKernel": "b0791251-777d-414c-b5cc-636b4f317d9a",
+    "lastScheduledRunId": null,
+    "lastSuccessfullyExecutedCode": "import os\nfrom groq import Groq\n\ngroq_api_key = os.environ.get(\"GROQ_API_KEY\")\n\nclient = Groq(\n    api_key=groq_api_key,\n)\n\n\nchat_streaming = client.chat.completions.create(\n    messages=[\n        {\"role\": \"system\", \"content\": \"You are a professional Data Engineer.\"},\n        {\"role\": \"user\", \"content\": \"Can you explain how the data lake works?\"},\n    ],\n    model=\"meta-llama/llama-4-scout-17b-16e-instruct\",\n    temperature=0.3,\n    max_tokens=1200,\n    top_p=1,\n    stop=None,\n    stream=True,\n)\n\nfor chunk in chat_streaming:\n    print(chunk.choices[0].delta.content, end=\"\")",
+    "outputsMetadata": {
+     "0": {
+      "height": 469,
+      "type": "stream"
+     }
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "As a Data Engineer, I'd be happy to explain how a data lake works.\n",
+      "\n",
+      "**What is a Data Lake?**\n",
+      "\n",
+      "A data lake is a centralized repository that stores raw, unprocessed data in its native format. It's a scalable and flexible storage solution that allows you to store and process large amounts of structured, semi-structured, and unstructured data. The data lake is often used as a precursor to data warehousing, data analytics, and machine learning.\n",
+      "\n",
+      "**Key Components of a Data Lake**\n",
+      "\n",
+      "1. **Storage**: The storage layer is the foundation of a data lake. It's typically a distributed file system, such as Hadoop Distributed File System (HDFS), Amazon S3, Azure Data Lake Storage (ADLS), or Google Cloud Storage (GCS). This layer stores raw data in its native format, without any transformation or processing.\n",
+      "2. **Data Ingestion**: Data ingestion is the process of collecting data from various sources and loading it into the data lake. This can be done through various methods, such as batch processing, streaming, or manual uploads.\n",
+      "3. **Data Processing**: The data processing layer is responsible for transforming and processing the raw data into a usable format. This can be done using various processing frameworks, such as Apache Spark, Apache Flink, or Azure Databricks.\n",
+      "4. **Metadata Management**: Metadata management is critical in a data lake, as it provides context and meaning to the stored data. This includes information such as data schema, data lineage, and data quality.\n",
+      "\n",
+      "**How a Data Lake Works**\n",
+      "\n",
+      "Here's a step-by-step overview of how a data lake works:\n",
+      "\n",
+      "1. **Data Ingestion**: Data is collected from various sources, such as databases, applications, IoT devices, or social media platforms.\n",
+      "2. **Data Landing**: The ingested data is landed in the data lake's storage layer, where it's stored in its native format.\n",
+      "3. **Data Processing**: The raw data is processed and transformed into a usable format using various processing frameworks.\n",
+      "4. **Data Cataloging**: The processed data is cataloged, which involves creating metadata that describes the data, such as its schema, format, and quality.\n",
+      "5. **Data Analysis**: The processed and cataloged data is made available for analysis, reporting, and machine learning.\n",
+      "\n",
+      "**Benefits of a Data Lake**\n",
+      "\n",
+      "The data lake offers several benefits, including:\n",
+      "\n",
+      "1. **Scalability**: Data lakes can store large amounts of data and scale horizontally as needed.\n",
+      "2. **Flexibility**: Data lakes can store various types of data, including structured, semi-structured, and unstructured data.\n",
+      "3. **Cost-Effective**: Data lakes can be more cost-effective than traditional data warehousing solutions.\n",
+      "4. **Improved Data Quality**: Data lakes provide a single source of truth for data, which improves data quality and reduces data duplication.\n",
+      "\n",
+      "**Common Use Cases for a Data Lake**\n",
+      "\n",
+      "1. **Data Warehousing**: Data lakes can be used as a precursor to data warehousing, providing a centralized repository for data before it's loaded into a data warehouse.\n",
+      "2. **Big Data Analytics**: Data lakes can be used for big data analytics, providing a scalable and flexible storage solution for large datasets.\n",
+      "3. **Machine Learning**: Data lakes can be used as a data source for machine learning models, providing a large and diverse dataset for training and testing.\n",
+      "4. **Data Archiving**: Data lakes can be used for data archiving, providing a cost-effective solution for storing historical data.\n",
+      "\n",
+      "I hope this helps! Do you have any specific questions about data lakes or would you like me to elaborate on any of these points?None"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "from groq import Groq\n",
+    "\n",
+    "groq_api_key = os.environ.get(\"GROQ_API_KEY\")\n",
+    "\n",
+    "client = Groq(\n",
+    "    api_key=groq_api_key,\n",
+    ")\n",
+    "\n",
+    "\n",
+    "chat_streaming = client.chat.completions.create(\n",
+    "    messages=[\n",
+    "        {\"role\": \"system\", \"content\": \"You are a professional Data Engineer.\"},\n",
+    "        {\"role\": \"user\", \"content\": \"Can you explain how the data lake works?\"},\n",
+    "    ],\n",
+    "    model=\"meta-llama/llama-4-scout-17b-16e-instruct\",\n",
+    "    temperature=0.3,\n",
+    "    max_tokens=1200,\n",
+    "    top_p=1,\n",
+    "    stop=None,\n",
+    "    stream=True,\n",
+    ")\n",
+    "\n",
+    "for chunk in chat_streaming:\n",
+    "    print(chunk.choices[0].delta.content, end=\"\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b1849fe7-4641-44c4-a91f-27976d2c1918",
+   "metadata": {},
+   "source": [
+    "## Initiating LLM and Embedding"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "65583358-ce96-4657-9b4b-fabc5a2f195e",
+   "metadata": {
+    "executionCancelledAt": null,
+    "executionTime": 560,
+    "lastExecutedAt": 1744298186634,
+    "lastExecutedByKernel": "b0791251-777d-414c-b5cc-636b4f317d9a",
+    "lastScheduledRunId": null,
+    "lastSuccessfullyExecutedCode": "from langchain_groq import ChatGroq\n\nllm = ChatGroq(model=\"meta-llama/llama-4-scout-17b-16e-instruct\", api_key=groq_api_key)"
+   },
+   "outputs": [],
+   "source": [
+    "from langchain_groq import ChatGroq\n",
+    "\n",
+    "llm = ChatGroq(model=\"meta-llama/llama-4-scout-17b-16e-instruct\", api_key=groq_api_key)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "70675683-6a4f-4331-b8b5-6c4e348fa389",
+   "metadata": {
+    "executionCancelledAt": null,
+    "executionTime": 661,
+    "lastExecutedAt": 1744298599903,
+    "lastExecutedByKernel": "b0791251-777d-414c-b5cc-636b4f317d9a",
+    "lastScheduledRunId": null,
+    "lastSuccessfullyExecutedCode": "from langchain_huggingface import HuggingFaceEmbeddings\nembed_model = HuggingFaceEmbeddings(model_name=\"mixedbread-ai/mxbai-embed-large-v1\")",
+    "outputsMetadata": {
+     "0": {
+      "height": 437,
+      "type": "stream"
+     }
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from langchain_huggingface import HuggingFaceEmbeddings\n",
+    "embed_model = HuggingFaceEmbeddings(model_name=\"mixedbread-ai/mxbai-embed-large-v1\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ff2b277e-dc31-4801-bd05-ffda3265523b",
+   "metadata": {},
+   "source": [
+    "## Loading and spliting the data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "12390e24-2c8f-4690-8060-69eea3c224a0",
+   "metadata": {
+    "executionCancelledAt": null,
+    "executionTime": 1932,
+    "lastExecutedAt": 1744298196669,
+    "lastExecutedByKernel": "b0791251-777d-414c-b5cc-636b4f317d9a",
+    "lastScheduledRunId": null,
+    "lastSuccessfullyExecutedCode": "from langchain_community.document_loaders import DirectoryLoader\nfrom langchain.text_splitter import RecursiveCharacterTextSplitter\n\n# Initialize the text splitter\ntext_splitter = RecursiveCharacterTextSplitter(\n    chunk_size=500,\n    chunk_overlap=50,\n    separators=[\"\\n\\n\", \"\\n\"]\n)\n\n# Load the .docx files\nloader = DirectoryLoader(\"./\", glob=\"*.docx\", use_multithreading=True)\ndocuments = loader.load()\n\n# Split the documents into chunks\nchunks = text_splitter.split_documents(documents)\n\n# Print the number of chunks\nprint(len(chunks))\n"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "29\n"
+     ]
+    }
+   ],
+   "source": [
+    "from langchain_community.document_loaders import DirectoryLoader\n",
+    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
+    "\n",
+    "# Initialize the text splitter\n",
+    "text_splitter = RecursiveCharacterTextSplitter(\n",
+    "    chunk_size=1000,\n",
+    "    chunk_overlap=100,\n",
+    "    separators=[\"\\n\\n\", \"\\n\"]\n",
+    ")\n",
+    "\n",
+    "# Load the .docx files\n",
+    "loader = DirectoryLoader(\"./\", glob=\"*.docx\", use_multithreading=True)\n",
+    "documents = loader.load()\n",
+    "\n",
+    "# Split the documents into chunks\n",
+    "chunks = text_splitter.split_documents(documents)\n",
+    "\n",
+    "# Print the number of chunks\n",
+    "print(len(chunks))\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "26085a16-42c8-4c6c-958f-9c1e2cc62b23",
+   "metadata": {},
+   "source": [
+    "## Creating the Vector Store"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "b27426d6-a218-4a44-9067-a9d4509e59c4",
+   "metadata": {
+    "executionCancelledAt": null,
+    "executionTime": 8773,
+    "lastExecutedAt": 1744298205442,
+    "lastExecutedByKernel": "b0791251-777d-414c-b5cc-636b4f317d9a",
+    "lastScheduledRunId": null,
+    "lastSuccessfullyExecutedCode": "from langchain_chroma import Chroma\n\nvectorstore = Chroma.from_documents(\n    documents=chunks,\n    embedding=embed_model,\n    persist_directory=\"./Vectordb\",\n)",
+    "outputsMetadata": {
+     "0": {
+      "height": 101,
+      "type": "stream"
+     }
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from langchain_chroma import Chroma\n",
+    "\n",
+    "vectorstore = Chroma.from_documents(\n",
+    "    documents=chunks,\n",
+    "    embedding=embed_model,\n",
+    "    persist_directory=\"./Vectordb\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "633bbbfa-36ac-426e-8599-0f3bfb3b80ea",
+   "metadata": {
+    "executionCancelledAt": null,
+    "executionTime": 166,
+    "lastExecutedAt": 1744298366376,
+    "lastExecutedByKernel": "b0791251-777d-414c-b5cc-636b4f317d9a",
+    "lastScheduledRunId": null,
+    "lastSuccessfullyExecutedCode": "query = \"What this tutorial about?\"\ndocs = vectorstore.similarity_search(query)\nprint(docs[0].page_content)",
+    "outputsMetadata": {
+     "0": {
+      "height": 122,
+      "type": "stream"
+     }
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Learn how to Fine-tune Stable Diffusion XL with DreamBooth and LoRA on your personal images. \n",
+      "\n",
+      "Let’s try another prompt:\n",
+      "\n",
+      "Prompt:\n"
+     ]
+    }
+   ],
+   "source": [
+    "query = \"What this tutorial about?\"\n",
+    "docs = vectorstore.similarity_search(query)\n",
+    "print(docs[0].page_content)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "304a6177-9f74-40a5-bac9-eb2df32a8bff",
+   "metadata": {},
+   "source": [
+    "## Creating the RAG pipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "5e3cd149-3009-4a12-98ed-8873d3bf9ab5",
+   "metadata": {
+    "executionCancelledAt": null,
+    "executionTime": 49,
+    "lastExecutedAt": 1744298865976,
+    "lastExecutedByKernel": "b0791251-777d-414c-b5cc-636b4f317d9a",
+    "lastScheduledRunId": null,
+    "lastSuccessfullyExecutedCode": "# Create retriever\nretriever = vectorstore.as_retriever()\n\n# Import PromptTemplate\nfrom langchain_core.prompts import PromptTemplate\n\n# Define a clearer, more professional prompt template\ntemplate = \"\"\"You are an expert assistant tasked with answering questions based on the provided documents.\nUse only the given context to generate your answer.\nIf the answer cannot be found in the context, clearly state that you do not know.\nBe detailed and precise in your response, but avoid mentioning or referencing the context itself.\n\nContext:\n{context}\n\nQuestion:\n{question}\n\nAnswer:\"\"\"\n\n# Create the PromptTemplate\nrag_prompt = PromptTemplate.from_template(template)\n"
+   },
+   "outputs": [],
+   "source": [
+    "# Create retriever\n",
+    "retriever = vectorstore.as_retriever()\n",
+    "\n",
+    "# Import PromptTemplate\n",
+    "from langchain_core.prompts import PromptTemplate\n",
+    "\n",
+    "# Define a clearer, more professional prompt template\n",
+    "template = \"\"\"You are an expert assistant tasked with answering questions based on the provided documents.\n",
+    "Use only the given context to generate your answer.\n",
+    "If the answer cannot be found in the context, clearly state that you do not know.\n",
+    "Be detailed and precise in your response, but avoid mentioning or referencing the context itself.\n",
+    "\n",
+    "Context:\n",
+    "{context}\n",
+    "\n",
+    "Question:\n",
+    "{question}\n",
+    "\n",
+    "Answer:\"\"\"\n",
+    "\n",
+    "# Create the PromptTemplate\n",
+    "rag_prompt = PromptTemplate.from_template(template)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "889685f6-3e5e-4abb-8391-084bdb6b7d4d",
+   "metadata": {
+    "executionCancelledAt": null,
+    "executionTime": 48,
+    "lastExecutedAt": 1744298875804,
+    "lastExecutedByKernel": "b0791251-777d-414c-b5cc-636b4f317d9a",
+    "lastScheduledRunId": null,
+    "lastSuccessfullyExecutedCode": "from langchain_core.output_parsers import StrOutputParser\nfrom langchain_core.runnables import RunnablePassthrough\n\nrag_chain = (\n    {\"context\": retriever, \"question\": RunnablePassthrough()}\n    | rag_prompt\n    | llm\n    | StrOutputParser()\n)"
+   },
+   "outputs": [],
+   "source": [
+    "from langchain_core.output_parsers import StrOutputParser\n",
+    "from langchain_core.runnables import RunnablePassthrough\n",
+    "\n",
+    "rag_chain = (\n",
+    "    {\"context\": retriever, \"question\": RunnablePassthrough()}\n",
+    "    | rag_prompt\n",
+    "    | llm\n",
+    "    | StrOutputParser()\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "ee406271-90d6-453b-a2aa-4753f08d30e5",
+   "metadata": {
+    "executionCancelledAt": null,
+    "executionTime": 888,
+    "lastExecutedAt": 1744298897317,
+    "lastExecutedByKernel": "b0791251-777d-414c-b5cc-636b4f317d9a",
+    "lastScheduledRunId": null,
+    "lastSuccessfullyExecutedCode": "from IPython.display import display, Markdown\n\nresponse = rag_chain.invoke(\"What this tutorial about?\")\nMarkdown(response)"
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/markdown": [
+       "This tutorial is about setting up and using the Janus project, specifically Janus Pro, a multimodal model that can understand images and generate images from text prompts, and building a local solution to use the model privately on a laptop GPU. It covers learning about the Janus Series, setting up the Janus project, building a Docker container to run the model locally, and testing its capabilities with various image and text prompts."
+      ],
+      "text/plain": [
+       "<IPython.core.display.Markdown object>"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from IPython.display import display, Markdown\n",
+    "\n",
+    "response = rag_chain.invoke(\"What this tutorial about?\")\n",
+    "Markdown(response)"
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "name": "Welcome to DataCamp Workspaces.ipynb",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
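
The notebook persists its Chroma index to ./Vectordb, so a later session can reopen the collection without re-embedding the documents. A minimal sketch, assuming the same embed_model object and persist directory as in the notebook:

from langchain_chroma import Chroma

# Reopen the collection written by Chroma.from_documents(..., persist_directory="./Vectordb")
vectorstore = Chroma(persist_directory="./Vectordb", embedding_function=embed_model)
retriever = vectorstore.as_retriever()
print(vectorstore.similarity_search("What this tutorial about?")[0].page_content)
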
requirements.txt ADDED
@@ -0,0 +1,6 @@
+langchain
+langchain-community
+langchainhub
+langchain-groq
+langchain-huggingface
+unstructured[docx]
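
Note that gradio and groq, both imported by main.py, are not listed here: on a Space with sdk: gradio the SDK supplies gradio, and groq is typically pulled in as a dependency of langchain-groq. For a local run, a minimal setup sketch (the API key value is a placeholder):

pip install -r requirements.txt gradio
export GROQ_API_KEY="your-key-here"   # placeholder, set your own key
python main.py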