import geopandas as gpd
import sqlite3
import torch
import faiss
import numpy as np
import os
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
import streamlit as st
# Restrict CUDA to the first GPU and silence the tokenizers fork/parallelism warning
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
st.write(f"Using device: {device}")
# Step 1: Load and Process Floodland Data
# Open (or create) the local SQLite database; see the persistence note below
conn = sqlite3.connect('NY.db')
# Load shapefile
gdf = gpd.read_file('S_FLD_HAZ_AR.shp')
# Keep only valid geometries (invalid ones are dropped rather than repaired)
gdf['geometry'] = gdf['geometry'].apply(lambda geom: geom if geom.is_valid else None)
gdf = gdf.dropna(subset=['geometry'])
# Convert CRS to UTM Zone 18N (New York)
gdf = gdf.to_crs(epsg=32618)
# Calculate acreage (1 square meter = 0.000247105 acres)
gdf['acreage'] = gdf.geometry.area * 0.000247105
# Define flood-prone zones and calculate usable area
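# A, AE, AH, AO, and VE are FEMA Special Flood Hazard Area designations
# (1%-annual-chance flood zones; VE is the coastal high-hazard zone), so
# parcels in these zones contribute no usable area.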
flood_prone_zones = ['A', 'AE', 'AH', 'AO', 'VE']
gdf['usable_area'] = gdf.apply(lambda row: row['acreage'] if row['FLD_ZONE'] not in flood_prone_zones else 0, axis=1)
# Convert geometry to WKT format
gdf['wkt_geometry'] = gdf['geometry'].apply(lambda geom: geom.wkt)
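# Note: the SQLite connection from Step 1 is not used anywhere else in this
# script. One plausible use (an assumption, not part of the original flow)
# would be persisting the processed attribute table so the shapefile does not
# have to be re-read on every run, e.g.:
#   gdf.drop(columns='geometry').to_sql('floodlands', conn, if_exists='replace', index=False)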
# Step 2: Load Embedding Model (Sentence-Transformer)
embedder = SentenceTransformer('all-MiniLM-L6-v2')
# Convert floodland descriptions into text
gdf['text'] = gdf.apply(
    lambda row: f"Flood Zone: {row['FLD_ZONE']}, Subtype: {row['ZONE_SUBTY']}, "
                f"Acreage: {row['acreage']:.2f} acres, Usable Area: {row['usable_area']:.2f} acres",
    axis=1
)
# Generate text embeddings
embeddings = embedder.encode(gdf['text'].tolist(), show_progress_bar=True)
# Build a FAISS index for exact L2 nearest-neighbour search (FAISS expects float32)
embeddings = np.asarray(embeddings, dtype="float32")
d = embeddings.shape[1]
index = faiss.IndexFlatL2(d)
index.add(embeddings)
# Store embeddings in DataFrame
gdf['embedding'] = list(embeddings)
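# d is 384 for all-MiniLM-L6-v2. Keeping the raw vectors in the DataFrame is
# redundant with the FAISS index, but handy for inspection or re-indexing.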
# Step 3: Load LLaMA Model for Summarization
llama_model_name = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(llama_model_name)
model = AutoModelForCausalLM.from_pretrained(llama_model_name, torch_dtype=torch.float16, device_map="auto")
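# Note: Llama-2 is a gated model on the Hugging Face Hub; access must be
# requested on the model page and the environment must be authenticated
# (e.g., via `huggingface-cli login`) before from_pretrained will succeed.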
# Function to generate a summary with LLaMA
def llama_summarize(text, total_acreage, usable_acreage, location_data, max_length=250):
    input_text = f"""
**Location query**: {text}
**Total Land Area**: {total_acreage:.2f} acres
**Usable Area**: {usable_acreage:.2f} acres
**Flood-prone Zones**:
{location_data}
Summarize the information above in a few sentences:
"""
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    # Budget generation so that prompt + summary stay near max_length tokens,
    # while always leaving room for a short summary
    input_length = inputs['input_ids'].shape[1]
    max_new_tokens = max(max_length - input_length, 64)
    with torch.no_grad():
        output_tokens = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,  # required for temperature/top_k/top_p to take effect
            temperature=0.7,
            top_k=50,
            top_p=0.9,
            repetition_penalty=1.2,
            pad_token_id=tokenizer.eos_token_id,  # LLaMA defines no pad token
        )
    # Decode only the newly generated tokens, not the echoed prompt
    summary = tokenizer.decode(output_tokens[0][input_length:], skip_special_tokens=True)
    return summary
# Step 4: RAG Summarization Function
def rag_summarize(query, gdf, index, k=5):
    query = query.lower().strip()
    query_embedding = embedder.encode([query])[0]
    # Retrieve the top-k most similar floodland descriptions from the FAISS index
    distances, indices = index.search(np.array([query_embedding]), k)
    retrieved_docs = gdf.iloc[indices[0]]
    # Aggregate acreage across the retrieved records
    total_acreage = retrieved_docs['acreage'].sum()
    usable_acreage = retrieved_docs['usable_area'].sum()
    location_data = "\n".join([
        f"- **Flood Zone**: {row['FLD_ZONE']}, **Subtype**: {row['ZONE_SUBTY']}, "
        f"**Acreage**: {row['acreage']:.2f}, **Usable Area**: {row['usable_area']:.2f} acres"
        for _, row in retrieved_docs.iterrows()
    ])
    # Use LLaMA to turn the retrieved records into a readable summary
    summary = llama_summarize(query, total_acreage, usable_acreage, location_data)
    return summary
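# Note: the index matches queries against the zone/acreage descriptions built
# in Step 2, which contain no place names, so a location query like "New York"
# is only a soft semantic match; k controls how many records feed the summary.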
# Streamlit Interface
st.title("🌊 Floodland Summary Bot (Powered by LLaMA-2)")
# Input for location
user_input = st.text_input("Enter a location (e.g., New York)", "")
# When the user inputs a query, display the summary
if user_input:
    query = user_input.lower().strip()
    summary = rag_summarize(query, gdf, index)
    st.write(summary)
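# To launch the app (assuming this file is saved as app.py):
#   streamlit run app.py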