|
import json
import re
from collections import Counter, defaultdict
|
|
|
|
|
|
|
# Maps a distinctive fragment of a venue's full name to its abbreviation.
# Entries are tried in insertion order with a case-insensitive substring test
# and the first hit wins, so a more specific name (e.g. the EACL / NAACL
# chapter names) must stay ahead of any shorter name it contains.
CONFERENCE_NAME_TO_ABBR = {
    "Conference on Dependency Linguistics": "DepLing",
    "Conference on Language Modeling": "COLM",
    "European Chapter of the Association for Computational Linguistics": "EACL",
    "North American Chapter of the Association for Computational Linguistics": "NAACL",
    "Empirical Methods in Natural Language Processing": "EMNLP",
    "Association for Computational Linguistics": "ACL",
    "Annual Meeting of the Association for Computational Linguistics": "ACL",
    # The workshop's abbreviation is LOUHI (previously misspelled "LUOHI").
    "International Workshop on Health Text Mining and Information Analysis": "LOUHI",
    "Conference on Computational Semantics": "IWCS",
    "Conference on Machine Translation": "WMT",
    "Conference Recent Advances in Natural Language Processing": "RANLP",
    "Conference on Computational Linguistics": "COLING",
    "Conference of Computational Linguistics": "NODALIDA",
    "Conference on Language Resources and Evaluation": "LREC",
}

# Boilerplate phrases deleted from a title before matching. They are removed
# one after another in this order, so "Poster Papers" must precede "Poster".
UNNEEDED_DESCRIPTIONS = [
    "Shared Task",
    "Short Papers",
    "Poster Papers",
    "Poster",
]

# Distinct abbreviations in first-seen order. Several full names may share one
# abbreviation (two map to "ACL"); dict.fromkeys drops the repeats while
# keeping the order the abbreviations are tested in.
KNOWN_CONFERENCE_NAMES = list(dict.fromkeys(CONFERENCE_NAME_TO_ABBR.values()))
|
|
|
def _build_ordinal_prefix_pattern():
    """Compile the regex matching a leading "the <ordinal word> " prefix.

    Covers the spelled-out ordinals "first" through "hundredth",
    case-insensitively, anchored at the start of the string.
    """
    units = ("first", "second", "third", "fourth", "fifth",
             "sixth", "seventh", "eighth", "ninth")
    teens = ("tenth", "eleventh", "twelfth", "thirteenth", "fourteenth",
             "fifteenth", "sixteenth", "seventeenth", "eighteenth",
             "nineteenth")
    # (round ordinal, stem used for the hyphenated forms)
    tens = (("twentieth", "twenty"), ("thirtieth", "thirty"),
            ("fortieth", "forty"), ("fiftieth", "fifty"),
            ("sixtieth", "sixty"), ("seventieth", "seventy"),
            ("eightieth", "eighty"), ("ninetieth", "ninety"))

    words = [*units, *teens]
    for round_form, stem in tens:
        words.append(round_form)
        words.extend(f"{stem}-{unit}" for unit in units)
    words.append("hundredth")

    return re.compile(r'^the\s+(?:' + '|'.join(words) + r')\s+', re.IGNORECASE)


_ORDINAL_PREFIX_RE = _build_ordinal_prefix_pattern()


def _normalize_conference_name(line):
    """Reduce one cleaned proceedings title to the name used for grouping.

    Args:
        line: A title already stripped of surrounding spaces, braces,
            backslashes and commas.

    Returns:
        A known abbreviation, the literal "Others", or whatever text is left
        of the title after all the clean-up steps, whitespace-collapsed.
    """
    # Drop four-digit years (19xx / 20xx).
    name = re.sub(r'\b(19|20)\d{2}\b', '', line)

    if name.startswith("Findings"):
        # For "Findings ...: <venue>" titles keep only the part after the
        # last colon.
        name = name.split(":")[-1].strip()
    else:
        # Otherwise cut everything from the first ":" or "--" onwards.
        # NOTE(review): the original indentation was lost; the "--" strip is
        # assumed to belong to this branch -- confirm.
        name = re.sub(r'\s*:\s*.*$', '', name)
        name = re.sub(r'\s*--\s*.*$', '', name)

    # Applied twice so that two consecutive trailing "(...)" groups both go.
    for _ in range(2):
        name = re.sub(r'\s*\([^)]*\)\s*$', '', name)
    name = re.sub(r'\s*Volume\s+\d+.*$', '', name, flags=re.IGNORECASE)
    name = re.sub(r'\s*Proceedings\s+of\s+', '', name, flags=re.IGNORECASE)

    # Numeric ordinals such as "12th ".
    name = re.sub(r'\b\d+(?:st|nd|rd|th)\s+', '', name, flags=re.IGNORECASE)
    name = name.replace("}", "")

    # When the word "Conference" occurs, discard everything before it.
    conference_match = re.search(r'\bConference\b', name, re.IGNORECASE)
    if conference_match:
        name = name[conference_match.start():]

    # Leading "the Fifth " style prefixes.
    name = _ORDINAL_PREFIX_RE.sub('', name)

    # Any whole word made only of the letters IVXLCDM is treated as a Roman
    # numeral and removed. (A narrower numeral pattern used to run first; it
    # only removed whole words this pattern also removes, so it was dropped.)
    name = re.sub(r'\b[IVXLCDM]+\b', '', name)

    # Punctuation and remaining digits become spaces.
    name = re.sub(r'[^\w\s]', ' ', name)
    name = re.sub(r'\d+', ' ', name)

    for unneeded_description in UNNEEDED_DESCRIPTIONS:
        name = name.replace(unneeded_description, "")

    # Swap the first matching full venue name for its abbreviation. The
    # replacement is case-insensitive, like the match itself, so a title that
    # differs from the table only in letter case is still rewritten.
    for conf_name, conf_abbr in CONFERENCE_NAME_TO_ABBR.items():
        name, replaced = re.subn(
            re.escape(conf_name),
            lambda _match, abbr=conf_abbr: abbr,
            name,
            flags=re.IGNORECASE,
        )
        if replaced:
            break

    # If a known abbreviation occurs anywhere in the text, the abbreviation
    # alone becomes the name.
    # NOTE(review): this is a plain substring test, so e.g. "acl" also matches
    # inside longer words -- confirm whether whole-word matching is wanted.
    lowered = name.lower()
    for conf in KNOWN_CONFERENCE_NAMES:
        if conf.lower() in lowered:
            name = conf
            break

    if "de la" in name or " le " in name or name == "Conference":
        name = "Others"

    if "Multi lingual" in name:
        name = name.replace("Multi lingual", "Multilingual")

    return re.sub(r'\s+', ' ', name).strip()


def extract_conference_info():
    """Extract unique conferences from conferences.txt and save to JSON.

    Reads ``data/conferences.txt`` (UTF-8, one proceedings title per line),
    normalises every distinct title to a venue name, and sums how many input
    lines fall under each name. Names are ranked by that count, highest first.
    A name whose share of all non-empty input lines is below 0.1%, or that is
    literally "Others", is folded into one trailing
    ``"Others (<n> Venues)"`` entry.

    The result is written to ``data/unique_conferences.json`` as a mapping
    ``name -> {"count": int, "conferences": [...]}``. For regular entries
    "conferences" holds the sorted cleaned titles; for the trailing entry it
    holds the folded-in names. A ranked summary is printed to stdout.

    Raises:
        OSError: If the input file cannot be read or the output file cannot
            be written.
    """
    with open('data/conferences.txt', 'r', encoding='utf-8') as f:
        content = f.read()

    all_lines = [line.strip() for line in content.split('\n') if line.strip()]
    total_lines = len(all_lines)

    # One counting pass replaces a list.count() call per unique line, and
    # iterating the Counter visits lines in first-seen order, which keeps the
    # output reproducible from run to run.
    line_counts = Counter(all_lines)

    conferences = defaultdict(set)
    abbr2count = defaultdict(int)

    for raw_line, occurrences in line_counts.items():
        # Trim surrounding spaces, backslashes, braces and commas.
        line = raw_line.strip(' \\{},')
        if not line:
            continue

        base_name = _normalize_conference_name(line)
        conferences[base_name].add(line)
        abbr2count[base_name] += occurrences

    conference_to_save = {}
    others = []
    ranked = sorted(abbr2count.items(), key=lambda item: item[1], reverse=True)
    # The printed rank is the position in the full ranking, so numbers may
    # skip where an entry was folded into "Others".
    for rank, (conf, count) in enumerate(ranked, start=1):
        ratio = count / total_lines
        if ratio < 0.001 or conf == "Others":
            others.append((conf, count))
            continue

        conference_to_save[conf] = {
            "count": count,
            # Sorted so the JSON does not depend on set iteration order.
            "conferences": sorted(conferences[conf]),
        }
        print(f"{rank}. {conf}: {count} [{ratio * 100:.1f}%]")

    conference_to_save[f"Others ({len(others)} Venues)"] = {
        "count": sum(count for _, count in others),
        "conferences": [conf for conf, _ in others],
    }

    with open('data/unique_conferences.json', 'w', encoding='utf-8') as f:
        json.dump(conference_to_save, f, indent=2, ensure_ascii=False)

    print(f"Extracted {len(conference_to_save)} unique conferences")
    print("Saved to data/unique_conferences.json")
|
|
|
# Script entry point: run the extraction only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    extract_conference_info()
|
|