"""
Multilingual Paper BibTeX Generator

This script parses anthology+abstracts.bib and generates multilingual_papers.bib,
containing only papers related to multilingual NLP research.

Usage:
    python generate_multilingual_bib.py

Requirements:
    - data/anthology+abstracts.bib relative to the working directory
"""

import os
import re
from collections import defaultdict
from typing import Dict, List, Optional, Set, Tuple

from tqdm import tqdm


MULTILINGUAL_KEYWORDS = [
    'multilingual', 'crosslingual', 'multi lingual', 'cross lingual',
    'multi-lingual', 'cross-lingual', 'low-resource language', 'low resource language',
    'multi-language', 'multi language', 'cross-language', 'cross language',
    'language transfer',
    'code-switching', 'code switching', 'language adaptation',
    'language pair', 'bilingual', 'trilingual', 'polyglot',
    'translation', 'nmt',
    'transliteration',
    'multilingual bert', 'xlm', 'mbert', 'xlm-roberta',
    'language identification', 'language detection',
]

LANGUAGE_NAMES = [
    'afrikaans', 'albanian', 'amharic', 'arabic', 'armenian', 'azerbaijani', 'basque', 'belarusian', 'bengali', 'bosnian', 'bulgarian', 'catalan', 'cebuano', 'chinese', 'croatian', 'czech', 'danish', 'dutch', 'esperanto', 'estonian', 'filipino', 'finnish', 'french', 'galician', 'georgian', 'german', 'greek', 'gujarati', 'haitian', 'hausa', 'hawaiian', 'hebrew', 'hindi', 'hmong', 'hungarian', 'icelandic', 'igbo', 'indonesian', 'irish', 'italian', 'japanese', 'javanese', 'kannada', 'kazakh', 'khmer', 'korean', 'kurdish', 'kyrgyz', 'lao', 'latin', 'latvian', 'lithuanian', 'luxembourgish', 'macedonian', 'malagasy', 'malay', 'malayalam', 'maltese', 'maori', 'marathi', 'mongolian', 'myanmar', 'nepali', 'norwegian', 'odia', 'pashto', 'persian', 'polish', 'portuguese', 'punjabi', 'romanian', 'russian', 'samoan', 'scots gaelic', 'serbian', 'sesotho', 'shona', 'sindhi', 'sinhala', 'slovak', 'slovenian', 'somali', 'spanish', 'sundanese', 'swahili', 'swedish', 'tagalog', 'tajik', 'tamil', 'telugu', 'thai', 'turkish', 'ukrainian', 'urdu', 'uzbek', 'vietnamese', 'welsh', 'xhosa', 'yiddish', 'yoruba', 'zulu',

    'mandarin', 'cantonese', 'hindi', 'urdu', 'bengali', 'tamil', 'telugu', 'marathi', 'gujarati', 'kannada', 'malayalam', 'punjabi', 'odia', 'assamese', 'maithili', 'sanskrit', 'kashmiri', 'konkani', 'manipuri', 'nepali', 'sindhi', 'dogri', 'bodo', 'santali', 'khasi', 'mizo', 'garo', 'naga', 'tibetan', 'dzongkha', 'sikkimese', 'lepcha', 'limbu', 'tamang', 'gurung', 'magar', 'tharu', 'tulu',

    # Broad region/group terms that often signal multilingual work.
    'african', 'indian', 'asian', 'indigenous',
]


def clean_latex_commands(text: str) -> str:
    """
    Clean LaTeX commands from text (same logic as JavaScript version).
    """
    if not text:
        return ''

    # Strip commands with a single braced argument, keeping the argument: \textbf{X} -> X
    text = re.sub(r'\\[a-zA-Z]+\{([^}]*)\}', r'\1', text)
    # Strip bare commands without arguments.
    text = re.sub(r'\\[a-zA-Z]+', '', text)
    # Unwrap remaining braces (including protective capitalisation braces): {X} -> X
    text = re.sub(r'\{\\?([^}]*)\}', r'\1', text)

    # Unescape common special characters.
    text = text.replace('\\"', '"')
    text = text.replace("\\'", "'")
    text = text.replace('\\&', '&')
    text = text.replace('\\%', '%')
    text = text.replace('\\_', '_')
    text = text.replace('\\$', '$')

    # Collapse runs of whitespace.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()
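
# Illustrative example (hypothetical input, not from the original script):
# clean_latex_commands(r'\textbf{Neural} {M}achine Translation \& more')
# is expected to return 'Neural Machine Translation & more' under the rules above.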


def is_multilingual_paper(paper: Dict[str, str]) -> Tuple[bool, Optional[str]]:
    """
    Determine whether a paper is multilingual and return the first matched
    keyword or language name, if any (same logic as JavaScript version).
    """
    text = f"{paper.get('title', '')} {paper.get('abstract', '')}".lower()

    # Keywords are matched as substrings of the combined title/abstract text.
    for keyword in MULTILINGUAL_KEYWORDS:
        if keyword.lower() in text:
            return True, keyword

    # Language names are matched as whole words to avoid spurious hits.
    for language in LANGUAGE_NAMES:
        if language.lower() in text.split():
            return True, language

    return False, None
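
# Illustrative example: a paper titled 'Cross-lingual Transfer for Swahili' would
# return (True, 'cross-lingual'), since keywords are checked before language names.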


def extract_keywords(paper: Dict[str, str]) -> Set[str]:
    """
    Extract multilingual keywords from a paper (same logic as JavaScript version).
    """
    keywords = set()
    text = f"{paper.get('title', '')} {paper.get('abstract', '')}".lower()

    for keyword in MULTILINGUAL_KEYWORDS:
        if keyword.lower() in text:
            keywords.add(keyword)

    # Note: unlike is_multilingual_paper, language names are matched as substrings here.
    for language in LANGUAGE_NAMES:
        if language.lower() in text:
            keywords.add(language)

    return keywords
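
# Illustrative example: a paper titled 'Neural Machine Translation for Hindi and Tamil'
# would yield keywords including {'translation', 'hindi', 'tamil'}.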


def parse_bibtex_entry(entry: str) -> Optional[Dict[str, str]]:
    """
    Parse a single BibTeX entry (same logic as JavaScript version).
    Returns None if the entry type and citation key cannot be parsed.
    """
    paper = {}

    # Entry type and citation key from the header line.
    type_match = re.match(r'@(\w+)\{([^,]+)', entry)
    if type_match:
        paper['type'] = type_match.group(1)
        paper['key'] = type_match.group(2)
    else:
        return None

    fields = ['title', 'author', 'abstract', 'year', 'booktitle', 'journal', 'pages']

    for field in fields:
        # Accept both brace-delimited and quote-delimited field values.
        pattern = rf'{field}\s*=\s*{{([^}}]*)}}|{field}\s*=\s*"([^"]*)"'
        match = re.search(pattern, entry, re.IGNORECASE)
        if match:
            value = match.group(1) or match.group(2)
            value = clean_latex_commands(value)
            paper[field] = value.strip()

    # Fall back to a four-digit year embedded in the citation key.
    if not paper.get('year') and paper.get('key'):
        year_match = re.search(r'\d{4}', paper['key'])
        if year_match:
            paper['year'] = year_match.group(0)

    paper['is_multilingual'], matched_keyword = is_multilingual_paper(paper)
    paper['keywords'] = list(extract_keywords(paper))
    paper['matched_keyword'] = matched_keyword

    return paper
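
# Illustrative example (hypothetical entry): parsing '@inproceedings{smith-2020-multilingual, title = {...}, ...}'
# would produce {'type': 'inproceedings', 'key': 'smith-2020-multilingual', ...} plus the
# derived 'is_multilingual', 'keywords', and 'matched_keyword' fields.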


def parse_bibtex(bib_text: str) -> List[Dict[str, str]]:
    """
    Parse BibTeX text into a list of paper dictionaries (same logic as JavaScript version).
    """
    papers = []

    # Each entry starts at an '@' marker.
    entries = re.split(r'(?=@)', bib_text)
    n_missing, n_total = 0, len(entries)

    for entry in tqdm(entries, desc="Parsing BibTeX entries"):
        if not entry.strip():
            continue

        paper = parse_bibtex_entry(entry)
        if paper and (paper.get('title') or paper.get('abstract')):
            papers.append(paper)
        elif paper is None:
            n_missing += 1

    # Count how often each keyword was the first match.
    keyword2count = defaultdict(int)
    for paper in papers:
        if not paper['matched_keyword']:
            continue
        keyword2count[paper['matched_keyword']] += 1
    n_multilingual_papers = sum(keyword2count.values())

    print(f"Parsed {len(papers)} papers from {n_total} entries. Ratio: {len(papers)/n_total*100:.1f}%")
    print(f"Failed to parse {n_missing} of {n_total} entries. Ratio: {n_missing/n_total*100:.1f}%")

    for keyword, count in sorted(keyword2count.items(), key=lambda x: x[1], reverse=True):
        print(f"\t {keyword}: {count} papers ({count/n_multilingual_papers*100:.1f}%)")

    return papers


def generate_bibtex_content(papers: List[Dict[str, str]]) -> str:
    """
    Generate BibTeX content from paper dictionaries (same logic as JavaScript version).
    """
    content = ''

    for paper in tqdm(papers, desc="Generating BibTeX content"):
        if not paper.get('type') or not paper.get('key'):
            print(f"Warning: Skipping paper without type or key: {paper.get('title', 'Unknown')[:50]}...")
            continue

        content += f"@{paper['type']}{{{paper['key']},\n"

        fields = ['title', 'author', 'abstract', 'year', 'booktitle', 'journal', 'pages']
        for field in fields:
            if paper.get(field):
                content += f"  {field} = {{{paper[field]}}},\n"

        content = content.rstrip(',\n') + '\n'
        content += '}\n\n'

    return content
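
# Illustrative shape of one emitted entry (hypothetical key and fields), noting that the
# trailing comma after the last field is stripped before the closing brace:
# @inproceedings{smith-2020-multilingual,
#   title = {...},
#   author = {...},
#   year = {2020}
# }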


def main():
    """
    Main function to generate multilingual_papers.bib.
    """
    input_file = 'data/anthology+abstracts.bib'
    output_file = 'data/multilingual_papers.bib'

    if not os.path.exists(input_file):
        print(f"Error: {input_file} not found.")
        print("Please ensure the file exists and run the script again.")
        return

    if os.path.exists(output_file):
        print(f"Warning: {output_file} already exists.")
        response = input("Do you want to overwrite it? (y/N): ")
        if response.lower() != 'y':
            print("Operation cancelled.")
            return

    print(f"Reading {input_file}...")
    with open(input_file, 'r', encoding='utf-8') as f:
        bib_text = f.read()

    all_papers = parse_bibtex(bib_text)
    print(f"Found {len(all_papers)} total papers")

    multilingual_papers = [paper for paper in all_papers if paper['is_multilingual']]
    print(f"Found {len(multilingual_papers)} multilingual papers out of {len(all_papers)} total papers. "
          f"Ratio: {len(multilingual_papers)/len(all_papers)*100:.1f}%")

    if not multilingual_papers:
        print("No multilingual papers found. Check your keywords and language lists.")
        return

    print("Generating BibTeX content...")
    bib_content = generate_bibtex_content(multilingual_papers)

    print(f"Writing to {output_file}...")
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(bib_content)

    print(f"Successfully generated {output_file} with {len(multilingual_papers)} papers!")

    print("\nStatistics:")
    print(f"  Total papers processed: {len(all_papers)}")
    print(f"  Multilingual papers found: {len(multilingual_papers)}")
    print(f"  Percentage multilingual: {len(multilingual_papers)/len(all_papers)*100:.1f}%")

    # Tally every keyword matched across the selected papers.
    all_keywords = []
    for paper in multilingual_papers:
        all_keywords.extend(paper['keywords'])

    keyword_counts = {}
    for keyword in all_keywords:
        keyword_counts[keyword] = keyword_counts.get(keyword, 0) + 1

    top_keywords = sorted(keyword_counts.items(), key=lambda x: x[1], reverse=True)[:10]
    print("\nTop 10 keywords found:")
    for keyword, count in top_keywords:
        print(f"  {keyword}: {count} papers")


if __name__ == "__main__":
    main()