# multilingual-paperbase / add_conferences.py
# Author: Crystina — initial commit (0a97af6)
import json
import re
from collections import Counter, defaultdict
# KNOWN_CONFERENCE_NAMES = ["COLING", "COLM", "EACL", "NAACL", "EMNLP", "AACL", "ACL"] # NOTE: NAACL and EACL need to come earlier than ACL
CONFERENCE_NAME_TO_ABBR = {
"Conference on Dependency Linguistics": "DepLing",
"Conference on Language Modeling": "COLM",
"European Chapter of the Association for Computational Linguistics": "EACL",
"North American Chapter of the Association for Computational Linguistics": "NAACL",
"Empirical Methods in Natural Language Processing": "EMNLP",
"Association for Computational Linguistics": "ACL",
"Annual Meeting of the Association for Computational Linguistics": "ACL",
"International Workshop on Health Text Mining and Information Analysis": "LUOHI",
"Conference on Computational Semantics": "IWCS",
"Conference on Machine Translation": "WMT",
"Conference Recent Advances in Natural Language Processing": "RANLP",
"Conference on Computational Linguistics": "COLING",
"Conference of Computational Linguistics": "NODALIDA",
"Conference on Language Resources and Evaluation": "LREC",
}
UNNEEDED_DESCRIPTIONS = [
"Shared Task",
"Short Papers",
"Poster Papers",
"Poster",
]
KNOWN_CONFERENCE_NAMES = list(CONFERENCE_NAME_TO_ABBR.values())
def extract_conference_info():
"""Extract unique conferences from conferences.txt and save to JSON"""
# Read the conferences file
with open('data/conferences.txt', 'r', encoding='utf-8') as f:
content = f.read()
# Split by lines and clean up
all_lines = [line.strip() for line in content.split('\n') if line.strip()]
lines = list(set(all_lines))
# Dictionary to store unique conferences with their years
conferences = defaultdict(set)
abbr2count = defaultdict(int)
for line in lines:
# Remove leading/trailing braces and clean up
_count = all_lines.count(line)
line = line.strip(' \{\},')
# line = line.replace("{", "")
# line = line.replace("}", "")
# line = line.strip()
# Skip empty lines
if not line:
continue
# Extract year from the conference name
year_match = re.search(r'\b(19|20)\d{2}\b', line)
year = year_match.group() if year_match else None
# Extract the base conference name (remove year and common suffixes)
# Remove year from the name for grouping
base_name = re.sub(r'\b(19|20)\d{2}\b', '', line)
# Remove common suffixes that don't affect the core conference name
if base_name.startswith("Findings"):
base_name = base_name.split(":")[-1].strip()
else:
base_name = re.sub(r'\s*:\s*.*$', '', base_name) # Remove everything after colon
base_name = re.sub(r'\s*--\s*.*$', '', base_name) # Remove everything after double dash
# remove everything within parentheses
base_name = re.sub(r'\s*\([^)]*\)\s*$', '', base_name) # Remove trailing parentheses
base_name = re.sub(r'\s*\([^)]*\)\s*$', '', base_name) # Remove trailing parentheses
base_name = re.sub(r'\s*Volume\s+\d+.*$', '', base_name, flags=re.IGNORECASE) # Remove volume info
base_name = re.sub(r'\s*Proceedings\s+of\s+', '', base_name, flags=re.IGNORECASE) # Remove "Proceedings of"
# Remove ordinal numbers (1st, 2nd, 3rd, 4th, 5th, 6th, 7th, 8th, 9th, 10th, 11th, 12th, etc.)
base_name = re.sub(r'\b\d+(?:st|nd|rd|th)\s+', '', base_name, flags=re.IGNORECASE)
base_name = base_name.replace("}", "")
# Remove any words before the first occurrence of "Conference"
conference_match = re.search(r'\bConference\b', base_name, re.IGNORECASE)
if conference_match:
start_pos = conference_match.start()
base_name = base_name[start_pos:]
# Remove "the First", "the Second", etc. from the beginning
base_name = re.sub(r'^the\s+(?:First|Second|Third|Fourth|Fifth|Sixth|Seventh|Eighth|Ninth|Tenth|Eleventh|Twelfth|Thirteenth|Fourteenth|Fifteenth|Sixteenth|Seventeenth|Eighteenth|Nineteenth|Twentieth|Twenty-first|Twenty-second|Twenty-third|Twenty-fourth|Twenty-fifth|Twenty-sixth|Twenty-seventh|Twenty-eighth|Twenty-ninth|Thirtieth|Thirty-first|Thirty-second|Thirty-third|Thirty-fourth|Thirty-fifth|Thirty-sixth|Thirty-seventh|Thirty-eighth|Thirty-ninth|Fortieth|Forty-first|Forty-second|Forty-third|Forty-fourth|Forty-fifth|Forty-sixth|Forty-seventh|Forty-eighth|Forty-ninth|Fiftieth|Fifty-first|Fifty-second|Fifty-third|Fifty-fourth|Fifty-fifth|Fifty-sixth|Fifty-seventh|Fifty-eighth|Fifty-ninth|Sixtieth|Sixty-first|Sixty-second|Sixty-third|Sixty-fourth|Sixty-fifth|Sixty-sixth|Sixty-seventh|Sixty-eighth|Sixty-ninth|Seventieth|Seventy-first|Seventy-second|Seventy-third|Seventy-fourth|Seventy-fifth|Seventy-sixth|Seventy-seventh|Seventy-eighth|Seventy-ninth|Eightieth|Eighty-first|Eighty-second|Eighty-third|Eighty-fourth|Eighty-fifth|Eighty-sixth|Eighty-seventh|Eighty-eighth|Eighty-ninth|Ninetieth|Ninety-first|Ninety-second|Ninety-third|Ninety-fourth|Ninety-fifth|Ninety-sixth|Ninety-seventh|Ninety-eighth|Ninety-ninth|Hundredth)\s+', '', base_name, flags=re.IGNORECASE)
# Remove Roman numerals (I, II, III, IV, V, VI, VII, VIII, IX, X, XI, XII, XIII, XIV, XV, XVI, XVII, XVIII, XIX, XX, etc.)
# This needs to happen BEFORE punctuation removal to catch Roman numerals properly
# More comprehensive pattern to catch all Roman numerals
base_name = re.sub(r'\b(?:I{1,3}|IV|VI{0,3}|IX|X{1,3}|XI{0,3}|XV|XX{0,3}|XXX{0,3}|XL|L|LX{0,3}|LXX{0,3}|LXXX|XC|C|CC{0,3}|CD|D|DC{0,3}|DCC{0,3}|DCCC|CM|M{0,3})\b', '', base_name)
# Also try a simpler approach - remove any sequence of I, V, X, L, C, D, M that looks like a Roman numeral
base_name = re.sub(r'\b[IVXLCDM]+\b', '', base_name)
# Replace punctuation with whitespace
base_name = re.sub(r'[^\w\s]', ' ', base_name)
# Replace all numbers with whitespace
base_name = re.sub(r'\d+', ' ', base_name)
# base_name = base_name.replace("Shared Task ", "")
for unneeded_description in UNNEEDED_DESCRIPTIONS:
base_name = base_name.replace(unneeded_description, "")
for conf_name, conf_abbr in CONFERENCE_NAME_TO_ABBR.items():
if conf_name.lower() in base_name.lower():
base_name = base_name.replace(conf_name, conf_abbr)
break
for conf in KNOWN_CONFERENCE_NAMES:
if conf.lower() in base_name.lower():
base_name = conf
break
if "de la" in base_name or " le " in base_name or base_name == "Conference":
base_name = "Others"
if "Multi lingual" in base_name:
base_name = base_name.replace("Multi lingual", "Multilingual")
# Clean up extra whitespace and consecutive whitespace
base_name = re.sub(r'\s+', ' ', base_name).strip()
# Skip if base name is too short
# if len(base_name) < 5:
# base_name = "Unknown"
# Add to conferences dictionary
# if year:
# conferences[base_name].add(int(year))
# else:
# conferences[base_name].add(None)
conferences[base_name].add(line)
abbr2count[base_name] += _count
# conf_abbr2keywords = {
# "ACL": ["Association for Computational Linguistics"],
# "EMNLP": ["Empirical Methods in Natural Language Processing"],
# "NAACL": ["North American Chapter of the Association for Computational Linguistics"],
# "EACL": ["European Chapter of the Association for Computational Linguistics"],
# "COLM": ["Conference on Computational Linguistics"],
# }
# print(f"Found {len(conferences)} unique conferences from {len(lines)} lines")
# for i, conf in enumerate(sorted(conferences.keys())):
# print(f"{i+1}. {conf}")
# if i > 200: break
# import pdb; pdb.set_trace()
conference_to_save = {}
others = []
for i, (conf, count) in enumerate(sorted(abbr2count.items(), key=lambda x: x[1], reverse=True)):
ratio = count / len(all_lines)
if ratio < 0.001 or conf == "Others":
others.append((conf, count))
continue
conference_to_save[conf] = {
"count": count,
"conferences": tuple(conferences[conf]),
}
print(f"{i+1}. {conf}: {count} [{ratio * 100:.1f}%]")
conference_to_save[f"Others ({len(others)} Venues)"] = {
"count": sum(count for conf, count in others),
"conferences": tuple(conf for conf, count in others),
}
# Save to JSON file
with open('data/unique_conferences.json', 'w', encoding='utf-8') as f:
json.dump(conference_to_save, f, indent=2, ensure_ascii=False)
print(f"Extracted {len(conference_to_save)} unique conferences")
print(f"Saved to data/unique_conferences.json")
# Script entry point: run with no arguments; reads data/conferences.txt and
# writes data/unique_conferences.json (paths are hard-coded inside the function).
if __name__ == "__main__":
    extract_conference_info()