Spaces:

kltn20133118
/

demo_obsei

Sleeping

App Files Files Community

kltn20133118 committed on Dec 9, 2024

Commit

9fab0e0

verified ·

1 Parent(s): a0dda64

Update main.py

Browse files

Files changed (1) hide show

main.py +133 -3

main.py CHANGED Viewed

@@ -2,6 +2,12 @@ from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 import pipe_line_obsei
 # Định nghĩa model request body
 class URLProcessRequest(BaseModel):
@@ -9,12 +15,74 @@ class URLProcessRequest(BaseModel):
     primary_db: str  # Tên database chính
     primary_collection: str  # Tên collection chính
     backup_db: str  # Tên database dự phòng
-    backup_collection: str  # Tên collection dự phòng
 # Khởi tạo FastAPI
 app = FastAPI(
-    title="ChatBot HCMUTE",
-    description="Python ChatBot is intended for use in the topic Customizing chatbots. With the construction of 2 students Vo Nhu Y - 20133118 and Nguyen Quang Phuc 20133080",
     swagger_ui_parameters={"syntaxHighlight.theme": "obsidian"},
     version="1.0.0",
     contact={
@@ -75,3 +143,65 @@ async def process_url_api(request: URLProcessRequest):
             detail=f"An error occurred while processing the request: {str(e)}"
         )

 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 import pipe_line_obsei
+import support_function as sf
+from typing import Dict, Any
+import json
+import asyncio
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
 # Request body model definition
 class URLProcessRequest(BaseModel):
     primary_db: str  # Primary database name
     primary_collection: str  # Primary collection name
     backup_db: str  # Backup database name
+    backup_collection: str  # Backup collection name
+# Request body accepted by the /api/v1/crawl/ endpoint.
+class SchemaReq(BaseModel):
+    # Extraction schema; forwarded unchanged to JsonCssExtractionStrategy.
+    # NOTE(review): `schema` is also the name of a BaseModel attribute, so
+    # pydantic may warn about (or, depending on version, reject) this field
+    # name -- confirm against the installed pydantic version.
+    schema: dict[str, Any]
+    # Address of the page to crawl.
+    url: str
+# Request body accepted by the /api/v1/crawl_main/ endpoint.
+class SchemaReqMain(BaseModel):
+    # Extraction schema; forwarded unchanged to JsonCssExtractionStrategy.
+    # NOTE(review): `schema` is also the name of a BaseModel attribute, so
+    # pydantic may warn about (or, depending on version, reject) this field
+    # name -- confirm against the installed pydantic version.
+    schema: dict[str, Any]
+    # Address of the page to crawl.
+    url: str
+    # When true, the crawler is given a script that scrolls to the bottom of
+    # the page until its height stops growing.
+    scroll: bool
+    # Value written into the "category" key of every extracted item.
+    category: str
+# Request body accepted by the time-normalisation endpoint.
+class TimeProcessRequest(BaseModel):
+    target_time : str  # Time string to normalise
+async def return_json(schema: Dict[str, Any], url: str) -> Dict[str, Any]:
+    extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
+    async with AsyncWebCrawler(always_by_pass_cache=True) as crawler:
+        result = await crawler.arun(
+            url=url,
+            exclude_external_links=True,
+            bypass_cache=True,
+            verbose=False,
+            warning=False,
+            extraction_strategy=extraction_strategy
+        )
+        # Parse nội dung đã crawl thành JSON
+        news_teasers = json.loads(result.extracted_content)
+        return news_teasers
+async def return_json_main (schema: dict[str, Any], url: str, scroll: bool, category: str):
+    js_code = """
+    (async function() {
+        let lastHeight = 0;
+        while (document.body.scrollHeight !== lastHeight) {
+            lastHeight = document.body.scrollHeight;
+            window.scrollTo(0, document.body.scrollHeight);
+            await new Promise(resolve => setTimeout(resolve, 1000)); // Đợi 1 giây giữa mỗi lần cuộn
+        }
+    })();
+""" if scroll else ""
+    extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
+    async with AsyncWebCrawler(always_bypass_cache=True) as crawler:
+        result = await crawler.arun(
+            url=url,
+            extraction_strategy=extraction_strategy,
+            bypass_cache=True,
+            warning=False,
+            js_code=js_code
+        )
+        await asyncio.sleep(5)
+        news_teasers = json.loads(result.extracted_content)
+        if isinstance(news_teasers, list):
+            for item in news_teasers:
+                item["category"] = category
+        else:
+            print("Unexpected data format:", news_teasers)
+            return []
+        return news_teasers
 # Khởi tạo FastAPI
 app = FastAPI(
+    title="Obsei",
+    description="API để xử lý dữ liệu từ các nguồn web",
     swagger_ui_parameters={"syntaxHighlight.theme": "obsidian"},
     version="1.0.0",
     contact={
             detail=f"An error occurred while processing the request: {str(e)}"
         )
+# API cho hàm chuẩn hóa thời gian
+@app.post("/api/v1/obsei/chuan_hoa_time/")  # endpoint chuẩn hóa thời gian
+async def chuan_hoa_time_api(request: TimeProcessRequest):
+    """
+    API nhận chuỗi thời gian và chuẩn hóa thời gian theo định dạng mong muốn.
+    """
+    try:
+        time_str = request.target_time # Bạn có thể thay đổi thuộc tính phù hợp
+        formatted_time = sf.chuan_hoa_time(time_str)
+        return {
+            "formatted_time": formatted_time
+        }
+    except Exception as e:
+        raise HTTPException(
+            status_code=500,
+            detail=f"An error occurred while processing the time: {str(e)}"
+        )
+# API cho crawling với schema
+@app.post("/api/v1/crawl/")
+async def crawl_url(request: SchemaReq):
+    """
+    API nhận request body chứa schema và URL, sau đó crawl và trả về dữ liệu.
+    """
+    try:
+        # Lấy schema và URL từ request body
+        schema = request.schema
+        url = request.url
+        # Gọi hàm `return_json` để lấy dữ liệu đã crawl
+        data = await return_json(schema, url)  # Gọi async function mà không cần asyncio.run()
+        return {"status": "success", "data": data}
+    except Exception as e:
+        raise HTTPException(
+            status_code=500,
+            detail=f"An error occurred while processing the request: {str(e)}"
+        )
+@app.post("/api/v1/crawl_main/")
+async def crawl_url(request: SchemaReqMain):
+    """
+    API nhận request body chứa schema và URL, sau đó crawl và trả về dữ liệu.
+    """
+    try:
+        # Lấy schema và URL từ request body
+        schema = request.schema
+        url = request.url
+        scroll = request.scroll
+        category = request.category
+        # Gọi hàm `return_json` để lấy dữ liệu đã crawl
+        data = await return_json_main(schema, url,scroll,category )  # Gọi async function mà không cần asyncio.run()
+        return {"status": "success", "data": data}
+    except Exception as e:
+        raise HTTPException(
+            status_code=500,
+            detail=f"An error occurred while processing the request: {str(e)}"
+        )