|
import geopandas as gpd |
|
import sqlite3 |
|
import pandas as pd |
|
import torch |
|
import faiss |
|
import numpy as np |
|
import os |
|
from shapely.geometry import shape |
|
from sentence_transformers import SentenceTransformer |
|
from transformers import AutoTokenizer, AutoModelForCausalLM |
|
import streamlit as st |
|
|
|
|
|
# Process-wide environment settings: expose only GPU 0 to CUDA and set
# TOKENIZERS_PARALLELISM to "false".
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0",
        "TOKENIZERS_PARALLELISM": "false",
    }
)

# Use the GPU when PyTorch reports one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Show the selected device on the Streamlit page.
st.write(f"Using device: {device}")
|
|
|
|
|
# Open the SQLite database file 'NY.db' and create a cursor on that connection.
# NOTE(review): neither `conn` nor `cursor` is used in the code visible here —
# confirm whether another part of the application relies on them.
conn = sqlite3.connect(database='NY.db')
cursor = conn.cursor()
|
|
|
|
|
# Flood zone codes whose polygons are counted as having no usable acreage.
flood_prone_zones = ['A', 'AE', 'AH', 'AO', 'VE']


@st.cache_resource
def _load_flood_index():
    """Load the flood-hazard shapefile and build the semantic search index.

    Streamlit re-executes the whole script on every widget interaction, so
    this expensive pipeline (shapefile read, reprojection, sentence embedding,
    FAISS index construction) is cached and runs once per server process.

    Returns:
        A ``(zones, encoder, vectors, flat_index)`` tuple where

        * ``zones`` is the prepared GeoDataFrame with the added columns
          ``acreage``, ``usable_area``, ``wkt_geometry``, ``text`` and
          ``embedding``;
        * ``encoder`` is the SentenceTransformer used to embed the rows;
        * ``vectors`` is the float32 embedding matrix, one row per polygon;
        * ``flat_index`` is a ``faiss.IndexFlatL2`` holding ``vectors`` in the
          same row order as ``zones`` (callers look rows up positionally).
    """
    zones = gpd.read_file('S_FLD_HAZ_AR.shp')

    # Keep only rows whose geometry is present and valid. The explicit notna()
    # check makes missing geometries a filtered row instead of an
    # AttributeError from calling `.is_valid` on None.
    has_usable_geometry = zones['geometry'].notna() & zones['geometry'].is_valid
    zones = zones[has_usable_geometry].copy()

    # Reproject to EPSG:32618 before measuring; the area is then scaled by
    # 0.000247105 (square meters -> acres, assuming the projected unit is the
    # meter as it is for UTM zones).
    zones = zones.to_crs(epsg=32618)
    zones['acreage'] = zones.geometry.area * 0.000247105

    # Polygons in a flood-prone zone contribute zero usable acreage; all
    # others (including rows with a missing zone code) keep their full area.
    in_flood_zone = zones['FLD_ZONE'].isin(flood_prone_zones)
    zones['usable_area'] = zones['acreage'].where(~in_flood_zone, 0)

    zones['wkt_geometry'] = zones['geometry'].apply(lambda geom: geom.wkt)

    # One short textual description per polygon; this is what gets embedded.
    zones['text'] = [
        f"Flood Zone: {zone}, Subtype: {subtype}, Acreage: {acres:.2f} acres, Usable Area: {usable:.2f} acres"
        for zone, subtype, acres, usable in zip(
            zones['FLD_ZONE'],
            zones['ZONE_SUBTY'],
            zones['acreage'],
            zones['usable_area'],
        )
    ]

    encoder = SentenceTransformer('all-MiniLM-L6-v2')

    # FAISS requires a contiguous float32 matrix.
    vectors = np.ascontiguousarray(
        encoder.encode(zones['text'].tolist(), show_progress_bar=True),
        dtype=np.float32,
    )

    flat_index = faiss.IndexFlatL2(vectors.shape[1])
    flat_index.add(vectors)

    zones['embedding'] = list(vectors)

    return zones, encoder, vectors, flat_index


# Module-level names used by the rest of the script.
gdf, embedder, embeddings, index = _load_flood_index()
d = embeddings.shape[1]
|
|
|
|
|
llama_model_name = "meta-llama/Llama-2-7b-chat-hf"


@st.cache_resource
def _load_llama(model_name):
    """Load the tokenizer and causal LM for ``model_name`` once per process.

    Streamlit re-runs the script on every interaction; caching the loaded
    objects avoids reloading the model weights on each rerun.

    Args:
        model_name: Hugging Face model identifier passed to
            ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple. The model is loaded with
        ``torch_dtype=torch.float16`` and ``device_map="auto"``.
    """
    llama_tokenizer = AutoTokenizer.from_pretrained(model_name)
    llama_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    return llama_tokenizer, llama_model


tokenizer, model = _load_llama(llama_model_name)
|
|
|
|
|
def llama_summarize(text, total_acreage, usable_acreage, location_data, max_length=250):
    """Generate a summary of retrieved flood-zone statistics with the LLM.

    Args:
        text: The user's query. Accepted for interface compatibility; it is
            not inserted into the prompt.
        total_acreage: Total acreage of the retrieved polygons (formatted to
            two decimals in the prompt).
        usable_acreage: Usable acreage of the retrieved polygons (formatted to
            two decimals in the prompt).
        location_data: Pre-formatted, newline-separated description of the
            retrieved zones; inserted verbatim into the prompt.
        max_length: Token budget for prompt plus generated tokens. When the
            prompt alone meets or exceeds it, a flat 200 new tokens are
            allowed instead.

    Returns:
        The first generated sequence decoded to a string with special tokens
        skipped.
    """
    # Prompt template, spelled out line by line so its exact content does not
    # depend on source indentation.
    input_text = (
        "\n"
        f"**Total Land Area**: {total_acreage:.2f} acres\n"
        f"**Usable Area**: {usable_acreage:.2f} acres\n"
        "**Flood-prone Zones**:\n"
        f"{location_data}\n"
        "\n"
        "Summarization in sentence\n"
    )

    inputs = tokenizer(input_text, return_tensors="pt").to(device)

    # Spend whatever is left of `max_length` after the prompt on new tokens.
    # NOTE(review): a prompt just under `max_length` leaves only a handful of
    # new tokens while a slightly longer one gets 200 — confirm this is the
    # intended budget rule.
    input_length = inputs['input_ids'].shape[1]
    max_new_tokens = max_length - input_length
    if max_new_tokens <= 0:
        max_new_tokens = 200

    with torch.no_grad():
        output_tokens = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            # temperature/top_k/top_p only influence decoding when sampling is
            # enabled, so request it explicitly rather than relying on the
            # checkpoint's default generation config.
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.9,
            repetition_penalty=1.2,
        )

    # NOTE(review): for decoder-only models the generated sequence presumably
    # begins with the prompt tokens, so the returned text would start with the
    # prompt itself — confirm that echoing it to the user is intended.
    summary = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    return summary
|
|
|
|
|
def rag_summarize(query, gdf, index, k=5):
    """Retrieve the rows most similar to ``query`` and summarize them.

    Args:
        query: Free-text user query; lower-cased and stripped before embedding.
        gdf: DataFrame whose row order matches the vectors stored in ``index``
            and which has the columns ``FLD_ZONE``, ``ZONE_SUBTY``,
            ``acreage`` and ``usable_area``.
        index: FAISS index to search; result ids are used as row positions
            into ``gdf``.
        k: Maximum number of neighbours to retrieve.

    Returns:
        The string produced by ``llama_summarize`` for the retrieved rows.
    """
    query = query.lower().strip()

    # FAISS expects a 2-D float32 array of shape (n_queries, dim).
    query_vectors = np.asarray(embedder.encode([query]), dtype=np.float32)

    _, neighbor_ids = index.search(query_vectors, k)

    # When the index holds fewer than k vectors FAISS pads the result with -1.
    # Passing -1 to `iloc` would silently select the LAST row, so drop the
    # padding before the positional lookup.
    hit_positions = neighbor_ids[0]
    hit_positions = hit_positions[hit_positions >= 0]
    retrieved_docs = gdf.iloc[hit_positions]

    total_acreage = retrieved_docs['acreage'].sum()
    usable_acreage = retrieved_docs['usable_area'].sum()

    # One markdown bullet per retrieved polygon.
    location_data = "\n".join(
        f"- **Flood Zone**: {zone}, **Subtype**: {subtype}, "
        f"**Acreage**: {acres:.2f}, **Usable Area**: {usable:.2f} acres"
        for zone, subtype, acres, usable in zip(
            retrieved_docs['FLD_ZONE'],
            retrieved_docs['ZONE_SUBTY'],
            retrieved_docs['acreage'],
            retrieved_docs['usable_area'],
        )
    )

    return llama_summarize(query, total_acreage, usable_acreage, location_data)
|
|
|
|
|
# Page title.
# NOTE(review): the first character of the title looks like a mis-encoded
# emoji — confirm the intended glyph.
st.title("๐ Floodland Summary Bot (Powered by LLaMA-2)")

# Free-text query box; empty by default.
user_input = st.text_input("Enter a location (e.g., New York)", "")

# Normalize first, then test: input consisting only of whitespace must not
# trigger retrieval and an LLM generation for an empty query.
query = user_input.lower().strip()
if query:
    summary = rag_summarize(query, gdf, index)
    st.write(summary)
|
|