# CT / app.py
# khushidhar1210's picture
# new code
# 44fdc1e verified
import geopandas as gpd
import sqlite3
import pandas as pd
import torch
import faiss
import numpy as np
import os
from shapely.geometry import shape
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
import streamlit as st
# Runtime configuration for the Hugging Face environment: expose GPU 0 (the
# one Hugging Face uses by default) and set TOKENIZERS_PARALLELISM to "false".
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0",
        "TOKENIZERS_PARALLELISM": "false",
    }
)

# Pick CUDA when torch reports it as available, otherwise fall back to the CPU,
# and show the choice in the app.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
st.write(f"Using device: {device}")
# Step 1: Load and Process Floodland Data
# NOTE(review): this connection and cursor are never used or closed in the
# visible script. They are kept so the module-level names still exist --
# confirm whether NY.db is needed at all.
conn = sqlite3.connect('NY.db')
cursor = conn.cursor()

# Load the flood-hazard shapefile.
gdf = gpd.read_file('S_FLD_HAZ_AR.shp')

# Keep only rows that have a geometry and whose geometry is valid. Checking
# notna() explicitly means records without a shape are dropped instead of
# raising AttributeError, which a per-row `geom.is_valid` call does on None.
has_valid_geometry = gdf['geometry'].notna() & gdf['geometry'].is_valid
gdf = gdf[has_valid_geometry].copy()

# Convert CRS to UTM Zone 18N (New York) so areas are computed in square metres.
gdf = gdf.to_crs(epsg=32618)

# Calculate acreage (1 square meter = 0.000247105 acres)
gdf['acreage'] = gdf.geometry.area * 0.000247105

# Usable area is the full acreage outside the flood-prone zones and zero
# inside them. A vectorised mask replaces the row-by-row apply.
flood_prone_zones = ['A', 'AE', 'AH', 'AO', 'VE']
in_flood_prone_zone = gdf['FLD_ZONE'].isin(flood_prone_zones)
gdf['usable_area'] = gdf['acreage'].where(~in_flood_prone_zone, 0.0)

# Convert geometry to WKT format
gdf['wkt_geometry'] = gdf['geometry'].apply(lambda geom: geom.wkt)
# Step 2: Load Embedding Model (Sentence-Transformer)
# Streamlit re-executes this script on every user interaction, so the model
# load and the bulk encoding are wrapped in st.cache_resource to run only once
# per distinct input.
@st.cache_resource
def _load_embedder(model_name):
    """Return the SentenceTransformer for *model_name*, loading it only once."""
    return SentenceTransformer(model_name)


@st.cache_resource
def _embed_texts(model_name, texts):
    """Encode *texts* (a tuple of strings) into a contiguous float32 matrix.

    The result is cached on (model_name, texts), so an unchanged dataset is
    not re-encoded on later reruns. The returned array is shared between
    reruns and must not be modified in place.
    """
    vectors = _load_embedder(model_name).encode(list(texts), show_progress_bar=True)
    # FAISS works on C-contiguous float32 arrays.
    return np.ascontiguousarray(vectors, dtype=np.float32)


embedder = _load_embedder('all-MiniLM-L6-v2')

# Convert floodland descriptions into text (one description per row, in row order).
gdf['text'] = [
    f"Flood Zone: {zone}, Subtype: {subtype}, Acreage: {acres:.2f} acres, Usable Area: {usable:.2f} acres"
    for zone, subtype, acres, usable in zip(
        gdf['FLD_ZONE'], gdf['ZONE_SUBTY'], gdf['acreage'], gdf['usable_area']
    )
]

# Generate text embeddings
embeddings = _embed_texts('all-MiniLM-L6-v2', tuple(gdf['text']))

# Create FAISS index (exact L2 search over the embedding dimension d).
d = embeddings.shape[1]
index = faiss.IndexFlatL2(d)
index.add(embeddings)

# Store embeddings in DataFrame (one vector per row, same order as the index).
gdf['embedding'] = list(embeddings)
# Step 3: Load LLaMA Model for Summarization
llama_model_name = "meta-llama/Llama-2-7b-chat-hf"


@st.cache_resource
def _load_llama(model_name):
    """Load the tokenizer and causal LM for *model_name* once and reuse them.

    Streamlit re-executes the script on every interaction; without caching
    the checkpoint would be loaded again on each rerun.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    llama_tokenizer = AutoTokenizer.from_pretrained(model_name)
    llama_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    return llama_tokenizer, llama_model


tokenizer, model = _load_llama(llama_model_name)
# Function to Generate Summary using LLaMA
def llama_summarize(text, total_acreage, usable_acreage, location_data, max_length=250):
    """Ask the LLaMA model for a prose summary of floodland figures.

    Args:
        text: The user's query. Accepted for interface compatibility; it is
            not inserted into the prompt.
        total_acreage: Number reported as "Total Land Area" in the prompt
            (formatted to two decimals).
        usable_acreage: Number reported as "Usable Area" in the prompt
            (formatted to two decimals).
        location_data: Pre-formatted text listed under "Flood-prone Zones".
        max_length: Token budget covering the prompt plus the generated text.
            When the prompt alone uses up the budget, 200 new tokens are
            allowed instead.

    Returns:
        The text generated by the model, with the prompt removed.
    """
    input_text = f"""
**Total Land Area**: {total_acreage:.2f} acres
**Usable Area**: {usable_acreage:.2f} acres
**Flood-prone Zones**:
{location_data}
Summarization in sentence
"""
    # The model is placed by device_map="auto", so send the inputs to wherever
    # the model actually lives rather than to a separately chosen device.
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Calculate max_new_tokens based on input size
    input_length = inputs['input_ids'].shape[1]
    remaining_budget = max_length - input_length
    max_new_tokens = remaining_budget if remaining_budget > 0 else 200

    with torch.no_grad():
        output_tokens = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            # temperature / top_k / top_p only apply when sampling; request it
            # explicitly instead of relying on the checkpoint's defaults.
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.9,
            repetition_penalty=1.2,
        )

    # For a causal LM the output sequence starts with the prompt tokens; keep
    # only the newly generated part so the prompt is not echoed back.
    generated_tokens = output_tokens[0][input_length:]
    summary = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    return summary
# Step 4: RAG Summarization Function
def rag_summarize(query, gdf, index, k=5):
    """Retrieve the records closest to *query* and summarize them with LLaMA.

    Args:
        query: Free-text query; it is lower-cased and stripped before being
            embedded.
        gdf: Frame with FLD_ZONE, ZONE_SUBTY, acreage and usable_area columns.
            Rows are selected by position, so the row order must match the
            order in which vectors were added to *index*.
        index: FAISS index to search.
        k: Maximum number of neighbours to retrieve.

    Returns:
        The summary string produced by ``llama_summarize``.
    """
    query = query.lower().strip()
    # FAISS takes a 2-D float32 matrix with one row per query vector.
    query_matrix = np.asarray(embedder.encode([query]), dtype=np.float32)

    # Retrieve top-k relevant documents
    _, indices = index.search(query_matrix, k)
    # FAISS fills missing results with -1 when the index holds fewer than k
    # vectors; iloc would read -1 as "last row", so drop those labels.
    hit_positions = [int(pos) for pos in indices[0] if pos >= 0]
    retrieved_docs = gdf.iloc[hit_positions]

    # Aggregate data
    total_acreage = retrieved_docs['acreage'].sum()
    usable_acreage = retrieved_docs['usable_area'].sum()
    location_data = "\n".join(
        f"- **Flood Zone**: {zone}, **Subtype**: {subtype}, "
        f"**Acreage**: {acres:.2f}, **Usable Area**: {usable:.2f} acres"
        for zone, subtype, acres, usable in zip(
            retrieved_docs['FLD_ZONE'],
            retrieved_docs['ZONE_SUBTY'],
            retrieved_docs['acreage'],
            retrieved_docs['usable_area'],
        )
    )

    # Use LLaMA for summarization
    return llama_summarize(query, total_acreage, usable_acreage, location_data)
# Streamlit Interface
# The title emoji was stored as mis-decoded UTF-8 bytes; it is restored here.
st.title("🌊 Floodland Summary Bot (Powered by LLaMA-2)")

# Input for location
user_input = st.text_input("Enter a location (e.g., New York)", "")

# Normalise first so that whitespace-only input does not trigger retrieval and
# generation with an empty query.
query = user_input.lower().strip()
if query:
    summary = rag_summarize(query, gdf, index)
    st.write(summary)