import geopandas as gpd
import sqlite3
import pandas as pd
import torch
import faiss
import numpy as np
import os
from shapely.geometry import shape
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
import streamlit as st

# Environment settings: restrict visible CUDA devices to GPU 0 and
# silence the Hugging Face tokenizers parallelism warning
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
st.write(f"Using device: {device}")

# Step 1: Load and Process Floodland Data
conn = sqlite3.connect('NY.db')  # used below to persist the processed attribute table

# Load the FEMA flood hazard area shapefile (NFHL layer S_FLD_HAZ_AR)
gdf = gpd.read_file('S_FLD_HAZ_AR.shp')

# Drop rows whose geometry is missing or invalid
gdf['geometry'] = gdf['geometry'].apply(lambda geom: geom if geom is not None and geom.is_valid else None)
gdf = gdf.dropna(subset=['geometry'])

# Reproject to UTM Zone 18N (covers New York) so areas come out in square meters
gdf = gdf.to_crs(epsg=32618)

# Calculate acreage (1 square meter = 0.000247105 acres)
gdf['acreage'] = gdf.geometry.area * 0.000247105

# Define flood-prone zones; usable area is zero inside them
flood_prone_zones = ['A', 'AE', 'AH', 'AO', 'VE']
gdf['usable_area'] = np.where(gdf['FLD_ZONE'].isin(flood_prone_zones), 0, gdf['acreage'])

# Convert geometry to WKT format
gdf['wkt_geometry'] = gdf['geometry'].apply(lambda geom: geom.wkt)
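
# Persist the processed attribute table to the SQLite database opened above.
# A minimal sketch: the table name 'floodland' is an assumption, and geometry
# is stored as WKT text because SQLite has no native geometry type.
gdf[['FLD_ZONE', 'ZONE_SUBTY', 'acreage', 'usable_area', 'wkt_geometry']].to_sql(
    'floodland', conn, if_exists='replace', index=False
)
conn.commit()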

# Step 2: Load Embedding Model (Sentence-Transformer)
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Convert floodland descriptions into text
gdf['text'] = gdf.apply(
    lambda row: f"Flood Zone: {row['FLD_ZONE']}, Subtype: {row['ZONE_SUBTY']}, Acreage: {row['acreage']:.2f} acres, Usable Area: {row['usable_area']:.2f} acres",
    axis=1
)

# Generate text embeddings
embeddings = embedder.encode(gdf['text'].tolist(), show_progress_bar=True)

# Create a FAISS index (exact L2 search); FAISS expects float32 vectors
d = embeddings.shape[1]
index = faiss.IndexFlatL2(d)
index.add(np.asarray(embeddings, dtype=np.float32))

# Store embeddings in DataFrame
gdf['embedding'] = list(embeddings)
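
# Optional: persist the FAISS index so it need not be rebuilt on every run.
# The filename 'floodland.index' is an assumption; reload it later with
# faiss.read_index('floodland.index').
faiss.write_index(index, 'floodland.index')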

# Step 3: Load LLaMA Model for Summarization
llama_model_name = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(llama_model_name)
model = AutoModelForCausalLM.from_pretrained(llama_model_name, torch_dtype=torch.float16, device_map="auto")
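
# Note: Streamlit re-runs this whole script on every interaction, so the models
# above are reloaded each time. A minimal sketch of the usual fix, assuming the
# loading code is moved into a cached factory (st.cache_resource keeps one copy
# per process):
#
# @st.cache_resource
# def load_llama(name: str = llama_model_name):
#     tok = AutoTokenizer.from_pretrained(name)
#     mdl = AutoModelForCausalLM.from_pretrained(name, torch_dtype=torch.float16, device_map="auto")
#     return tok, mdl
#
# tokenizer, model = load_llama()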

# Function to Generate Summary using LLaMA
def llama_summarize(query, total_acreage, usable_acreage, location_data, max_length=250):
    input_text = f"""
    **Location query**: {query}
    **Total Land Area**: {total_acreage:.2f} acres
    **Usable Area**: {usable_acreage:.2f} acres
    **Flood-prone Zones**:
     {location_data}

    Summarize the floodland information above in a few sentences.
    """

    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)  # move to wherever device_map placed the model
    
    # Budget the generation so prompt + completion stays near max_length tokens
    input_length = inputs['input_ids'].shape[1]
    max_new_tokens = max_length - input_length
    if max_new_tokens <= 0:
        max_new_tokens = 200  # prompt already exceeds the budget; fall back to a fixed allowance
    
    with torch.no_grad():
        output_tokens = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,  # cap the generated length
            do_sample=True,                 # required for temperature/top_k/top_p to take effect
            temperature=0.7,
            top_k=50,
            top_p=0.9,
            repetition_penalty=1.2
        )
    
    # Decode only the newly generated tokens, skipping the echoed prompt
    summary = tokenizer.decode(output_tokens[0][input_length:], skip_special_tokens=True)
    return summary

# Step 4: RAG Summarization Function
def rag_summarize(query, gdf, index, k=5):
    query = query.lower().strip()
    query_embedding = embedder.encode([query])[0]

    # Retrieve the top-k most similar documents (FAISS expects float32 queries)
    distances, indices = index.search(np.asarray([query_embedding], dtype=np.float32), k)
    retrieved_docs = gdf.iloc[indices[0]]

    # Aggregate data
    total_acreage = retrieved_docs['acreage'].sum()
    usable_acreage = retrieved_docs['usable_area'].sum()
    location_data = "\n".join([
        f"- **Flood Zone**: {row['FLD_ZONE']}, **Subtype**: {row['ZONE_SUBTY']}, "
        f"**Acreage**: {row['acreage']:.2f}, **Usable Area**: {row['usable_area']:.2f} acres"
        for _, row in retrieved_docs.iterrows()
    ])

    # Use LLaMA for summarization
    summary = llama_summarize(query, total_acreage, usable_acreage, location_data)
    
    return summary

# Streamlit Interface
st.title("🌊 Floodland Summary Bot (Powered by LLaMA-2)")

# Input for location
user_input = st.text_input("Enter a location (e.g., New York)", "")

# When the user inputs a query, display the summary
if user_input:
    query = user_input.lower().strip()
    summary = rag_summarize(query, gdf, index)
    st.write(summary)
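
# Usage: `streamlit run app.py` (filename assumed). Note that
# meta-llama/Llama-2-7b-chat-hf is a gated checkpoint on the Hugging Face Hub,
# so access must be requested and a token configured (e.g. `huggingface-cli login`)
# before the model can be downloaded.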