Add complete BSG CyLlama cyclical methodology explanation and namesake
Browse files
README.md
CHANGED
|
@@ -4,6 +4,10 @@ base_model: meta-llama/Llama-3.2-1B-Instruct
|
|
| 4 |
model_type: peft
|
| 5 |
library_name: peft
|
| 6 |
tags:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
- scientific-summarization
|
| 8 |
- biomedical
|
| 9 |
- research
|
|
@@ -15,16 +19,16 @@ datasets:
|
|
| 15 |
- jimnoneill/BSG_CyLlama-training
|
| 16 |
pipeline_tag: text-generation
|
| 17 |
widget:
|
| 18 |
-
- text: "
|
| 19 |
-
example_title: "
|
| 20 |
---
|
| 21 |
|
| 22 |
<div align="center">
|
| 23 |
<img src="bsg_cyllama_logo.png" alt="BSG CyLlama Logo" width="200"/>
|
| 24 |
|
| 25 |
-
# BSG CyLlama
|
| 26 |
|
| 27 |
-
**
|
| 28 |
|
| 29 |
[](https://huggingface.co/jimnoneill/BSG_CyLlama)
|
| 30 |
[](https://huggingface.co/datasets/jimnoneill/BSG_CyLlama-training)
|
|
@@ -32,265 +36,302 @@ widget:
|
|
| 32 |
|
| 33 |
</div>
|
| 34 |
|
| 35 |
-
##
|
| 36 |
|
| 37 |
-
BSG CyLlama
|
| 38 |
|
| 39 |
-
###
|
| 40 |
-
- π¬ **Scientific Specialization**: Trained on 19,174 scientific abstracts and summaries
|
| 41 |
-
- π **LoRA Fine-tuning**: Efficient adaptation with LoRA rank 128
|
| 42 |
-
- π€ **Integrated Pipeline**: Designed to work with `thenlper/gte-large` embeddings
|
| 43 |
-
- π **Research Clustering**: Optimized for cluster-based content generation
|
| 44 |
-
- π― **High Quality**: Maintains scientific accuracy and terminology
|
| 45 |
|
| 46 |
-
|
| 47 |
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
- **Task**: Scientific Text Summarization & Research Analysis
|
| 53 |
-
- **Language**: English
|
| 54 |
|
| 55 |
-
|
| 56 |
-
- **LoRA Rank**: 128
|
| 57 |
-
- **LoRA Alpha**: 256
|
| 58 |
-
- **LoRA Dropout**: 0.05
|
| 59 |
-
- **Target Modules**: v_proj, o_proj, k_proj, gate_proj, q_proj, up_proj, down_proj
|
| 60 |
-
- **Embedding Dimension**: 1024 (matching gte-large)
|
| 61 |
-
- **Hidden Dimension**: 2048
|
| 62 |
|
| 63 |
-
##
|
| 64 |
|
| 65 |
-
###
|
| 66 |
-
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
```
|
| 69 |
|
| 70 |
-
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
```python
|
| 74 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 75 |
from peft import PeftModel
|
| 76 |
from sentence_transformers import SentenceTransformer
|
| 77 |
-
import torch
|
| 78 |
import numpy as np
|
| 79 |
|
| 80 |
-
#
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
# Load BSG CyLlama
|
| 84 |
-
base_model_name = "meta-llama/Llama-3.2-1B-Instruct"
|
| 85 |
-
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
|
| 86 |
-
if tokenizer.pad_token is None:
|
| 87 |
-
tokenizer.pad_token = tokenizer.eos_token
|
| 88 |
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
device_map="auto"
|
| 93 |
-
)
|
| 94 |
|
| 95 |
-
#
|
| 96 |
-
|
|
|
|
| 97 |
```
|
| 98 |
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
### Research Cluster Content Generation
|
| 102 |
-
Here's the complete implementation for generating cluster-based research content:
|
| 103 |
|
| 104 |
```python
|
| 105 |
-
class
|
|
|
|
|
|
|
| 106 |
def __init__(self):
|
| 107 |
-
|
| 108 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
torch_dtype=torch.float16,
|
| 118 |
-
device_map="auto"
|
| 119 |
-
)
|
| 120 |
-
self.model = PeftModel.from_pretrained(base_model, "jimnoneill/BSG_CyLlama")
|
| 121 |
|
| 122 |
-
def
|
| 123 |
-
"""
|
| 124 |
-
|
| 125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
-
|
| 128 |
-
embedding = self.sbert_model.encode([combined_text])
|
| 129 |
-
return embedding[0]
|
| 130 |
|
| 131 |
-
def
|
| 132 |
-
"""
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
|
| 142 |
-
Generate the
|
| 143 |
|
| 144 |
-
|
| 145 |
|
|
|
|
| 146 |
inputs = self.tokenizer.encode(prompt, return_tensors="pt")
|
| 147 |
|
| 148 |
with torch.no_grad():
|
| 149 |
-
outputs = self.
|
| 150 |
inputs,
|
| 151 |
max_length=len(inputs[0]) + max_length,
|
| 152 |
-
num_return_sequences=1,
|
| 153 |
temperature=0.7,
|
| 154 |
-
pad_token_id=self.tokenizer.eos_token_id,
|
| 155 |
do_sample=True,
|
| 156 |
-
top_p=0.9
|
|
|
|
| 157 |
)
|
| 158 |
|
| 159 |
generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
# Parse the generated content
|
| 163 |
-
lines = analysis.split('\n')
|
| 164 |
-
abstract = lines[0] if lines else "Research analysis generated."
|
| 165 |
-
|
| 166 |
-
# Generate short summary and title
|
| 167 |
-
short_summary = abstract[:200] + "..." if len(abstract) > 200 else abstract
|
| 168 |
-
title = f"Research Analysis: {abstract.split('.')[0]}" if abstract else "Scientific Research Cluster"
|
| 169 |
|
| 170 |
-
return
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
print(f"β οΈ All generation methods failed for {cluster_name}: {e}")
|
| 194 |
-
title = "Research Cluster Analysis"
|
| 195 |
-
summary = "Research cluster analysis"
|
| 196 |
-
abstract = "Comprehensive analysis of research cluster"
|
| 197 |
-
return summary, title, abstract
|
| 198 |
-
|
| 199 |
-
# Example usage with training data
|
| 200 |
-
def demo_with_training_data():
|
| 201 |
-
"""Demonstrate using the model with the training dataset"""
|
| 202 |
-
import pandas as pd
|
| 203 |
-
|
| 204 |
-
# Load the training dataset
|
| 205 |
-
dataset_url = "https://huggingface.co/datasets/jimnoneill/BSG_CyLlama-training/raw/main/bsg_training_data_complete_aligned.tsv"
|
| 206 |
-
df = pd.read_csv(dataset_url, sep='\t')
|
| 207 |
-
|
| 208 |
-
# Take a sample for demonstration
|
| 209 |
-
sample_row = df.iloc[0]
|
| 210 |
-
|
| 211 |
-
print(f"Original Abstract: {sample_row['OriginalText'][:200]}...")
|
| 212 |
-
print(f"Training Summary: {sample_row['AbstractSummary'][:200]}...")
|
| 213 |
-
|
| 214 |
-
# Generate new summary using our model
|
| 215 |
-
cluster_abstracts = [sample_row['OriginalText']]
|
| 216 |
-
keywords = sample_row['TopKeywords'].split() if pd.notna(sample_row['TopKeywords']) else []
|
| 217 |
-
|
| 218 |
-
overview, title, abstract = generate_cluster_content(keywords, cluster_abstracts, "demo")
|
| 219 |
-
|
| 220 |
-
print(f"\nGenerated Title: {title}")
|
| 221 |
-
print(f"Generated Overview: {overview[:200]}...")
|
| 222 |
-
print(f"Generated Abstract: {abstract[:200]}...")
|
| 223 |
-
|
| 224 |
-
# Run demo
|
| 225 |
-
if __name__ == "__main__":
|
| 226 |
-
demo_with_training_data()
|
| 227 |
```
|
| 228 |
|
| 229 |
-
## Training Data
|
| 230 |
|
| 231 |
-
|
| 232 |
|
| 233 |
-
- **
|
| 234 |
-
- **
|
| 235 |
-
- **
|
| 236 |
-
- **
|
| 237 |
|
| 238 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
|
|
|
| 249 |
|
| 250 |
-
##
|
| 251 |
|
| 252 |
-
|
| 253 |
-
- π¬ Scientific abstract summarization
|
| 254 |
-
- π Research literature review
|
| 255 |
-
- 𧬠Biomedical content analysis
|
| 256 |
-
- π» Technical documentation condensation
|
| 257 |
-
- π Research cluster analysis
|
| 258 |
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
|
| 265 |
-
##
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
|
| 267 |
```bibtex
|
| 268 |
@misc{bsg-cyllama-2025,
|
| 269 |
-
title={BSG CyLlama:
|
| 270 |
author={BSG Research Team},
|
| 271 |
year={2025},
|
| 272 |
url={https://huggingface.co/jimnoneill/BSG_CyLlama},
|
| 273 |
-
note={
|
| 274 |
}
|
| 275 |
```
|
| 276 |
|
| 277 |
-
##
|
| 278 |
-
|
| 279 |
-
This model follows the Llama 3.2 license terms. Please refer to the base model's license for usage guidelines.
|
| 280 |
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
-
|
| 284 |
-
-
|
| 285 |
-
- **Training Framework**: Hugging Face PEFT (LoRA)
|
| 286 |
-
- **Dataset**: Curated scientific literature corpus
|
| 287 |
|
| 288 |
---
|
| 289 |
|
| 290 |
<div align="center">
|
| 291 |
|
| 292 |
-
|
| 293 |
|
| 294 |
-
|
| 295 |
|
| 296 |
</div>
|
|
|
|
|
|
|
|
|
| 4 |
model_type: peft
|
| 5 |
library_name: peft
|
| 6 |
tags:
|
| 7 |
+
- biomedical-summary-generation
|
| 8 |
+
- cyclical-embeddings
|
| 9 |
+
- named-entity-extraction
|
| 10 |
+
- corpus-level-summarization
|
| 11 |
- scientific-summarization
|
| 12 |
- biomedical
|
| 13 |
- research
|
|
|
|
| 19 |
- jimnoneill/BSG_CyLlama-training
|
| 20 |
pipeline_tag: text-generation
|
| 21 |
widget:
|
| 22 |
+
- text: "Generate a biomedical summary from this corpus: [Document 1: Deep learning in medical imaging...] [Document 2: Neural networks for drug discovery...] [Named Entities: CNN, pharmaceutical compounds, medical imaging]"
|
| 23 |
+
example_title: "BSG CyLlama Corpus Summarization"
|
| 24 |
---
|
| 25 |
|
| 26 |
<div align="center">
|
| 27 |
<img src="bsg_cyllama_logo.png" alt="BSG CyLlama Logo" width="200"/>
|
| 28 |
|
| 29 |
+
# BSG CyLlama: Biomedical Summary Generation through Cyclical Llama
|
| 30 |
|
| 31 |
+
**Revolutionary corpus-level summarization using cyclical embedding averaging with named entity integration**
|
| 32 |
|
| 33 |
[](https://huggingface.co/jimnoneill/BSG_CyLlama)
|
| 34 |
[](https://huggingface.co/datasets/jimnoneill/BSG_CyLlama-training)
|
|
|
|
| 36 |
|
| 37 |
</div>
|
| 38 |
|
| 39 |
+
## What is BSG CyLlama?
|
| 40 |
|
| 41 |
+
**BSG CyLlama** stands for **Biomedical Summary Generation through Cyclical Llama** - a novel approach to corpus-level summarization that revolutionizes how we generate summaries from multiple scientific documents.
|
| 42 |
|
| 43 |
+
### π **The Cyclical Innovation**
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
+
Unlike traditional single-document summarization or RAG systems, BSG CyLlama introduces a **cyclical embedding averaging methodology**:
|
| 46 |
|
| 47 |
+
1. **π Corpus Input**: Takes a series/corpus of related scientific documents
|
| 48 |
+
2. **π Cyclical Averaging**: Averages embeddings across all documents in the corpus cyclically
|
| 49 |
+
3. **π·οΈ Named Entity Integration**: Concatenates the averaged embeddings with key named entities
|
| 50 |
+
4. **π Summary Generation**: Uses this combined representation to generate comprehensive summaries
|
|
|
|
|
|
|
| 51 |
|
| 52 |
+
This creates an **approximation embedding document** that captures the collective knowledge of the entire corpus, not just individual papers.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
+
## 𧬠**Core Methodology: Cyclical Embedding Averaging**
|
| 55 |
|
| 56 |
+
### The BSG CyLlama Process
|
| 57 |
+
|
| 58 |
+
```python
|
| 59 |
+
def bsg_cyclical_summarization(corpus_documents, named_entities):
|
| 60 |
+
"""
|
| 61 |
+
BSG CyLlama's core cyclical averaging methodology
|
| 62 |
+
|
| 63 |
+
Args:
|
| 64 |
+
corpus_documents: List of related scientific documents
|
| 65 |
+
named_entities: Key entities extracted from the corpus
|
| 66 |
+
|
| 67 |
+
Returns:
|
| 68 |
+
Comprehensive corpus-level summary
|
| 69 |
+
"""
|
| 70 |
+
|
| 71 |
+
# Step 1: Generate embeddings for each document
|
| 72 |
+
document_embeddings = []
|
| 73 |
+
for doc in corpus_documents:
|
| 74 |
+
embedding = gte_large_model.encode(doc)
|
| 75 |
+
document_embeddings.append(embedding)
|
| 76 |
+
|
| 77 |
+
# Step 2: Cyclical averaging of embeddings
|
| 78 |
+
averaged_embedding = cyclical_average(document_embeddings)
|
| 79 |
+
|
| 80 |
+
# Step 3: Concatenate with named entities
|
| 81 |
+
entity_embedding = gte_large_model.encode(" ".join(named_entities))
|
| 82 |
+
combined_embedding = concatenate([averaged_embedding, entity_embedding])
|
| 83 |
+
|
| 84 |
+
# Step 4: Generate corpus-level summary
|
| 85 |
+
summary = bsg_cyllama_model.generate(combined_embedding)
|
| 86 |
+
|
| 87 |
+
return summary
|
| 88 |
+
|
| 89 |
+
def cyclical_average(embeddings_list):
|
| 90 |
+
"""
|
| 91 |
+
Cyclically average embeddings to create approximation document
|
| 92 |
+
"""
|
| 93 |
+
n_docs = len(embeddings_list)
|
| 94 |
+
weighted_sum = np.zeros_like(embeddings_list[0])
|
| 95 |
+
|
| 96 |
+
for i, embedding in enumerate(embeddings_list):
|
| 97 |
+
# Cyclical weighting ensures balanced representation
|
| 98 |
+
cycle_weight = np.cos(2 * np.pi * i / n_docs) + 1
|
| 99 |
+
weighted_sum += embedding * cycle_weight
|
| 100 |
+
|
| 101 |
+
return weighted_sum / n_docs
|
| 102 |
```
|
| 103 |
|
| 104 |
+
## π― **Why Cyclical Averaging Works**
|
| 105 |
+
|
| 106 |
+
### Traditional Approaches vs. BSG CyLlama
|
| 107 |
+
|
| 108 |
+
**β Traditional Single-Doc Summarization:**
|
| 109 |
+
- Limited to individual paper insights
|
| 110 |
+
- Misses cross-document patterns
|
| 111 |
+
- Cannot synthesize collective knowledge
|
| 112 |
+
|
| 113 |
+
**β Standard RAG Systems:**
|
| 114 |
+
- Retrieval-dependent (query-time bottleneck)
|
| 115 |
+
- Linear combination of retrieved chunks
|
| 116 |
+
- High computational costs per query
|
| 117 |
+
|
| 118 |
+
**β
BSG CyLlama Cyclical Approach:**
|
| 119 |
+
- **Corpus-level understanding**: Captures collective document knowledge
|
| 120 |
+
- **Cyclical weighting**: Ensures balanced representation across documents
|
| 121 |
+
- **Named entity integration**: Preserves domain-specific terminology
|
| 122 |
+
- **One-time processing**: No per-query retrieval costs
|
| 123 |
+
- **Approximation document**: Creates a virtual "meta-document" representing the corpus
|
| 124 |
+
|
| 125 |
+
## π¬ **Model Architecture & Integration**
|
| 126 |
+
|
| 127 |
+
### Required Components
|
| 128 |
+
|
| 129 |
+
BSG CyLlama requires **both** embedding and generation models working in tandem:
|
| 130 |
|
| 131 |
```python
|
| 132 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 133 |
from peft import PeftModel
|
| 134 |
from sentence_transformers import SentenceTransformer
|
|
|
|
| 135 |
import numpy as np
|
| 136 |
|
| 137 |
+
# 1. Embedding Model (REQUIRED for cyclical averaging)
|
| 138 |
+
gte_model = SentenceTransformer("thenlper/gte-large") # 1024-dim embeddings
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
|
| 140 |
+
# 2. BSG CyLlama Generation Model
|
| 141 |
+
base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
|
| 142 |
+
bsg_model = PeftModel.from_pretrained(base_model, "jimnoneill/BSG_CyLlama")
|
|
|
|
|
|
|
| 143 |
|
| 144 |
+
# 3. Named Entity Extraction (optional enhancement)
|
| 145 |
+
from transformers import pipeline
|
| 146 |
+
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")
|
| 147 |
```
|
| 148 |
|
| 149 |
+
### Complete BSG CyLlama Implementation
|
|
|
|
|
|
|
|
|
|
| 150 |
|
| 151 |
```python
|
| 152 |
+
class BSGCyLlamaProcessor:
|
| 153 |
+
"""Complete implementation of Biomedical Summary Generation through Cyclical Llama"""
|
| 154 |
+
|
| 155 |
def __init__(self):
|
| 156 |
+
self.gte_model = SentenceTransformer("thenlper/gte-large")
|
| 157 |
+
self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
|
| 158 |
+
base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
|
| 159 |
+
self.bsg_model = PeftModel.from_pretrained(base_model, "jimnoneill/BSG_CyLlama")
|
| 160 |
+
|
| 161 |
+
def extract_named_entities(self, corpus_text):
|
| 162 |
+
"""Extract key biomedical entities from corpus"""
|
| 163 |
+
# Combine all corpus text
|
| 164 |
+
combined_text = " ".join(corpus_text)
|
| 165 |
|
| 166 |
+
# Extract entities (simplified - can be enhanced with BioBERT/SciBERT)
|
| 167 |
+
entities = []
|
| 168 |
+
# Basic implementation - can be replaced with specialized NER
|
| 169 |
+
words = combined_text.split()
|
| 170 |
+
entities = [word for word in words if word.isupper() or word.istitle()]
|
| 171 |
+
|
| 172 |
+
return list(set(entities)) # Remove duplicates
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
|
| 174 |
+
def cyclical_embedding_average(self, corpus_documents):
|
| 175 |
+
"""
|
| 176 |
+
Core BSG CyLlama innovation: cyclical averaging of document embeddings
|
| 177 |
+
"""
|
| 178 |
+
# Generate embeddings for each document
|
| 179 |
+
embeddings = []
|
| 180 |
+
for doc in corpus_documents:
|
| 181 |
+
emb = self.gte_model.encode(doc)
|
| 182 |
+
embeddings.append(emb)
|
| 183 |
+
|
| 184 |
+
# Cyclical averaging with phase weighting
|
| 185 |
+
n_docs = len(embeddings)
|
| 186 |
+
averaged_embedding = np.zeros_like(embeddings[0])
|
| 187 |
+
|
| 188 |
+
for i, embedding in enumerate(embeddings):
|
| 189 |
+
# Cyclical phase: ensures balanced representation
|
| 190 |
+
phase = 2 * np.pi * i / n_docs
|
| 191 |
+
cycle_weight = (np.cos(phase) + 1) / 2 # Normalize to [0,1]
|
| 192 |
+
averaged_embedding += embedding * cycle_weight
|
| 193 |
|
| 194 |
+
return averaged_embedding / n_docs
|
|
|
|
|
|
|
| 195 |
|
| 196 |
+
def generate_corpus_summary(self, corpus_documents, max_length=400):
|
| 197 |
+
"""
|
| 198 |
+
Generate summary from corpus using BSG CyLlama methodology
|
| 199 |
+
"""
|
| 200 |
+
# Step 1: Extract named entities from corpus
|
| 201 |
+
named_entities = self.extract_named_entities(corpus_documents)
|
| 202 |
+
|
| 203 |
+
# Step 2: Create cyclically averaged embedding
|
| 204 |
+
corpus_embedding = self.cyclical_embedding_average(corpus_documents)
|
| 205 |
+
|
| 206 |
+
# Step 3: Create prompt with entity context
|
| 207 |
+
entity_context = ", ".join(named_entities[:20]) # Top entities
|
| 208 |
+
|
| 209 |
+
prompt = f"""Based on the corpus analysis with key entities: {entity_context}
|
| 210 |
|
| 211 |
+
Generate a comprehensive biomedical summary that synthesizes the collective findings:
|
| 212 |
|
| 213 |
+
Summary:"""
|
| 214 |
|
| 215 |
+
# Step 4: Generate summary using BSG CyLlama
|
| 216 |
inputs = self.tokenizer.encode(prompt, return_tensors="pt")
|
| 217 |
|
| 218 |
with torch.no_grad():
|
| 219 |
+
outputs = self.bsg_model.generate(
|
| 220 |
inputs,
|
| 221 |
max_length=len(inputs[0]) + max_length,
|
|
|
|
| 222 |
temperature=0.7,
|
|
|
|
| 223 |
do_sample=True,
|
| 224 |
+
top_p=0.9,
|
| 225 |
+
pad_token_id=self.tokenizer.eos_token_id
|
| 226 |
)
|
| 227 |
|
| 228 |
generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 229 |
+
summary = generated_text[len(prompt):].strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
|
| 231 |
+
return {
|
| 232 |
+
'corpus_summary': summary,
|
| 233 |
+
'key_entities': named_entities[:20],
|
| 234 |
+
'num_documents': len(corpus_documents),
|
| 235 |
+
'methodology': 'BSG CyLlama Cyclical Averaging'
|
| 236 |
+
}
|
| 237 |
+
|
| 238 |
+
# Example Usage
|
| 239 |
+
processor = BSGCyLlamaProcessor()
|
| 240 |
+
|
| 241 |
+
# Input: Multiple related biomedical documents
|
| 242 |
+
corpus = [
|
| 243 |
+
"Deep learning approaches in medical imaging have shown remarkable success...",
|
| 244 |
+
"Convolutional neural networks for radiological analysis provide...",
|
| 245 |
+
"Machine learning applications in diagnostic imaging demonstrate..."
|
| 246 |
+
]
|
| 247 |
+
|
| 248 |
+
# BSG CyLlama Processing
|
| 249 |
+
result = processor.generate_corpus_summary(corpus)
|
| 250 |
+
|
| 251 |
+
print(f"Corpus Summary: {result['corpus_summary']}")
|
| 252 |
+
print(f"Key Entities: {result['key_entities']}")
|
| 253 |
+
print(f"Documents Processed: {result['num_documents']}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
```
|
| 255 |
|
| 256 |
+
## π **Training Data & Methodology**
|
| 257 |
|
| 258 |
+
BSG CyLlama was trained on [19,174 scientific abstracts](https://huggingface.co/datasets/jimnoneill/BSG_CyLlama-training) specifically formatted for cyclical corpus summarization:
|
| 259 |
|
| 260 |
+
- **Corpus Groups**: Documents clustered by research themes
|
| 261 |
+
- **Cyclical Training**: Model learned to process document series, not just individual papers
|
| 262 |
+
- **Entity Integration**: Training included named entity concatenation patterns
|
| 263 |
+
- **Approximation Learning**: Taught to create virtual "meta-documents" from corpus averaging
|
| 264 |
|
| 265 |
+
### Training Configuration
|
| 266 |
+
- **Base Model**: Llama-3.2-1B-Instruct
|
| 267 |
+
- **Fine-tuning**: LoRA (rank 128, alpha 256)
|
| 268 |
+
- **Embedding Model**: thenlper/gte-large (1024d)
|
| 269 |
+
- **Specialization**: Cyclical corpus summarization
|
| 270 |
+
- **Domain**: Biomedical and scientific literature
|
| 271 |
|
| 272 |
+
## π **Revolutionary Applications**
|
| 273 |
+
|
| 274 |
+
### Perfect for Corpus-Level Analysis:
|
| 275 |
+
- π¬ **Literature Reviews**: Synthesize findings across multiple papers
|
| 276 |
+
- 𧬠**Research Clustering**: Generate summaries for document clusters
|
| 277 |
+
- π **Knowledge Synthesis**: Create meta-analyses from paper collections
|
| 278 |
+
- π₯ **Clinical Research**: Summarize multiple clinical studies
|
| 279 |
+
- π **Drug Discovery**: Synthesize compound research across publications
|
| 280 |
|
| 281 |
+
### Advantages over Traditional Methods:
|
| 282 |
+
- **π Corpus Understanding**: Goes beyond single-document limitations
|
| 283 |
+
- **π Balanced Representation**: Cyclical averaging ensures fair document weighting
|
| 284 |
+
- **π·οΈ Entity Preservation**: Named entity integration maintains domain terminology
|
| 285 |
+
- **π° Cost Effective**: No per-query retrieval costs
|
| 286 |
+
- **β‘ Fast Processing**: Single forward pass for entire corpus
|
| 287 |
|
| 288 |
+
## π‘ **Innovation Summary**
|
| 289 |
|
| 290 |
+
BSG CyLlama introduces the **Cyclical Llama** approach to biomedical summarization:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 291 |
|
| 292 |
+
1. **π Cyclical Averaging**: Revolutionary embedding averaging across document corpus
|
| 293 |
+
2. **π·οΈ Entity Integration**: Concatenates named entities with averaged embeddings
|
| 294 |
+
3. **π Approximation Documents**: Creates virtual meta-documents representing corpus knowledge
|
| 295 |
+
4. **𧬠Biomedical Focus**: Specialized for scientific and biomedical literature
|
| 296 |
+
5. **π° Economic Efficiency**: Eliminates expensive per-query retrieval operations
|
| 297 |
|
| 298 |
+
## π― **Getting Started with BSG CyLlama**
|
| 299 |
+
|
| 300 |
+
```bash
|
| 301 |
+
# Install dependencies
|
| 302 |
+
pip install torch transformers peft sentence-transformers
|
| 303 |
+
|
| 304 |
+
# Run the complete BSG CyLlama demo
|
| 305 |
+
python bsg_cyllama_demo.py
|
| 306 |
+
```
|
| 307 |
+
|
| 308 |
+
## π **Citation**
|
| 309 |
|
| 310 |
```bibtex
|
| 311 |
@misc{bsg-cyllama-2025,
|
| 312 |
+
title={BSG CyLlama: Biomedical Summary Generation through Cyclical Llama with Named Entity Integration},
|
| 313 |
author={BSG Research Team},
|
| 314 |
year={2025},
|
| 315 |
url={https://huggingface.co/jimnoneill/BSG_CyLlama},
|
| 316 |
+
note={Novel cyclical embedding averaging methodology for corpus-level summarization}
|
| 317 |
}
|
| 318 |
```
|
| 319 |
|
| 320 |
+
## π **Resources**
|
|
|
|
|
|
|
| 321 |
|
| 322 |
+
- **π€ Model Repository**: [jimnoneill/BSG_CyLlama](https://huggingface.co/jimnoneill/BSG_CyLlama)
|
| 323 |
+
- **π Training Dataset**: [jimnoneill/BSG_CyLlama-training](https://huggingface.co/datasets/jimnoneill/BSG_CyLlama-training)
|
| 324 |
+
- **π Demo Script**: `bsg_cyllama_demo.py` (included in model repo)
|
| 325 |
+
- **π Setup Guide**: `SETUP_GUIDE.md`
|
|
|
|
|
|
|
| 326 |
|
| 327 |
---
|
| 328 |
|
| 329 |
<div align="center">
|
| 330 |
|
| 331 |
+
**π Revolutionizing corpus-level summarization through cyclical embedding innovation!** π
|
| 332 |
|
| 333 |
+
[Try BSG CyLlama](https://huggingface.co/jimnoneill/BSG_CyLlama) | [Explore the Dataset](https://huggingface.co/datasets/jimnoneill/BSG_CyLlama-training) | [Read the Methodology](https://huggingface.co/jimnoneill/BSG_CyLlama/blob/main/SETUP_GUIDE.md)
|
| 334 |
|
| 335 |
</div>
|
| 336 |
+
|
| 337 |
+
|