# multilingual-paperbase / add_conferences.py
# Author: Crystina — initial commit (0a97af6)
import json
import re
from collections import Counter, defaultdict
# KNOWN_CONFERENCE_NAMES = ["COLING", "COLM", "EACL", "NAACL", "EMNLP", "AACL", "ACL"] # NOTE: NAACL and EACL need to come earlier than ACL
CONFERENCE_NAME_TO_ABBR = {
"Conference on Dependency Linguistics": "DepLing",
"Conference on Language Modeling": "COLM",
"European Chapter of the Association for Computational Linguistics": "EACL",
"North American Chapter of the Association for Computational Linguistics": "NAACL",
"Empirical Methods in Natural Language Processing": "EMNLP",
"Association for Computational Linguistics": "ACL",
"Annual Meeting of the Association for Computational Linguistics": "ACL",
"International Workshop on Health Text Mining and Information Analysis": "LUOHI",
"Conference on Computational Semantics": "IWCS",
"Conference on Machine Translation": "WMT",
"Conference Recent Advances in Natural Language Processing": "RANLP",
"Conference on Computational Linguistics": "COLING",
"Conference of Computational Linguistics": "NODALIDA",
"Conference on Language Resources and Evaluation": "LREC",
}
UNNEEDED_DESCRIPTIONS = [
"Shared Task",
"Short Papers",
"Poster Papers",
"Poster",
]
KNOWN_CONFERENCE_NAMES = list(CONFERENCE_NAME_TO_ABBR.values())
def extract_conference_info():
"""Extract unique conferences from conferences.txt and save to JSON"""
# Read the conferences file
with open('data/conferences.txt', 'r', encoding='utf-8') as f:
content = f.read()
# Split by lines and clean up
all_lines = [line.strip() for line in content.split('\n') if line.strip()]
lines = list(set(all_lines))
# Dictionary to store unique conferences with their years
conferences = defaultdict(set)
abbr2count = defaultdict(int)
for line in lines:
# Remove leading/trailing braces and clean up
_count = all_lines.count(line)
line = line.strip(' \{\},')
# line = line.replace("{", "")
# line = line.replace("}", "")
# line = line.strip()
# Skip empty lines
if not line:
continue
# Extract year from the conference name
year_match = re.search(r'\b(19|20)\d{2}\b', line)
year = year_match.group() if year_match else None
# Extract the base conference name (remove year and common suffixes)
# Remove year from the name for grouping
base_name = re.sub(r'\b(19|20)\d{2}\b', '', line)
# Remove common suffixes that don't affect the core conference name
if base_name.startswith("Findings"):
base_name = base_name.split(":")[-1].strip()
else:
base_name = re.sub(r'\s*:\s*.*$', '', base_name) # Remove everything after colon
base_name = re.sub(r'\s*--\s*.*$', '', base_name) # Remove everything after double dash
# remove everything within parentheses
base_name = re.sub(r'\s*\([^)]*\)\s*$', '', base_name) # Remove trailing parentheses
base_name = re.sub(r'\s*\([^)]*\)\s*$', '', base_name) # Remove trailing parentheses
base_name = re.sub(r'\s*Volume\s+\d+.*$', '', base_name, flags=re.IGNORECASE) # Remove volume info
base_name = re.sub(r'\s*Proceedings\s+of\s+', '', base_name, flags=re.IGNORECASE) # Remove "Proceedings of"
# Remove ordinal numbers (1st, 2nd, 3rd, 4th, 5th, 6th, 7th, 8th, 9th, 10th, 11th, 12th, etc.)
base_name = re.sub(r'\b\d+(?:st|nd|rd|th)\s+', '', base_name, flags=re.IGNORECASE)
base_name = base_name.replace("}", "")
# Remove any words before the first occurrence of "Conference"
conference_match = re.search(r'\bConference\b', base_name, re.IGNORECASE)
if conference_match:
start_pos = conference_match.start()
base_name = base_name[start_pos:]
# Remove "the First", "the Second", etc. from the beginning
base_name = re.sub(r'^the\s+(?:First|Second|Third|Fourth|Fifth|Sixth|Seventh|Eighth|Ninth|Tenth|Eleventh|Twelfth|Thirteenth|Fourteenth|Fifteenth|Sixteenth|Seventeenth|Eighteenth|Nineteenth|Twentieth|Twenty-first|Twenty-second|Twenty-third|Twenty-fourth|Twenty-fifth|Twenty-sixth|Twenty-seventh|Twenty-eighth|Twenty-ninth|Thirtieth|Thirty-first|Thirty-second|Thirty-third|Thirty-fourth|Thirty-fifth|Thirty-sixth|Thirty-seventh|Thirty-eighth|Thirty-ninth|Fortieth|Forty-first|Forty-second|Forty-third|Forty-fourth|Forty-fifth|Forty-sixth|Forty-seventh|Forty-eighth|Forty-ninth|Fiftieth|Fifty-first|Fifty-second|Fifty-third|Fifty-fourth|Fifty-fifth|Fifty-sixth|Fifty-seventh|Fifty-eighth|Fifty-ninth|Sixtieth|Sixty-first|Sixty-second|Sixty-third|Sixty-fourth|Sixty-fifth|Sixty-sixth|Sixty-seventh|Sixty-eighth|Sixty-ninth|Seventieth|Seventy-first|Seventy-second|Seventy-third|Seventy-fourth|Seventy-fifth|Seventy-sixth|Seventy-seventh|Seventy-eighth|Seventy-ninth|Eightieth|Eighty-first|Eighty-second|Eighty-third|Eighty-fourth|Eighty-fifth|Eighty-sixth|Eighty-seventh|Eighty-eighth|Eighty-ninth|Ninetieth|Ninety-first|Ninety-second|Ninety-third|Ninety-fourth|Ninety-fifth|Ninety-sixth|Ninety-seventh|Ninety-eighth|Ninety-ninth|Hundredth)\s+', '', base_name, flags=re.IGNORECASE)
# Remove Roman numerals (I, II, III, IV, V, VI, VII, VIII, IX, X, XI, XII, XIII, XIV, XV, XVI, XVII, XVIII, XIX, XX, etc.)
# This needs to happen BEFORE punctuation removal to catch Roman numerals properly
# More comprehensive pattern to catch all Roman numerals
base_name = re.sub(r'\b(?:I{1,3}|IV|VI{0,3}|IX|X{1,3}|XI{0,3}|XV|XX{0,3}|XXX{0,3}|XL|L|LX{0,3}|LXX{0,3}|LXXX|XC|C|CC{0,3}|CD|D|DC{0,3}|DCC{0,3}|DCCC|CM|M{0,3})\b', '', base_name)
# Also try a simpler approach - remove any sequence of I, V, X, L, C, D, M that looks like a Roman numeral
base_name = re.sub(r'\b[IVXLCDM]+\b', '', base_name)
# Replace punctuation with whitespace
base_name = re.sub(r'[^\w\s]', ' ', base_name)
# Replace all numbers with whitespace
base_name = re.sub(r'\d+', ' ', base_name)
# base_name = base_name.replace("Shared Task ", "")
for unneeded_description in UNNEEDED_DESCRIPTIONS:
base_name = base_name.replace(unneeded_description, "")
for conf_name, conf_abbr in CONFERENCE_NAME_TO_ABBR.items():
if conf_name.lower() in base_name.lower():
base_name = base_name.replace(conf_name, conf_abbr)
break
for conf in KNOWN_CONFERENCE_NAMES:
if conf.lower() in base_name.lower():
base_name = conf
break
if "de la" in base_name or " le " in base_name or base_name == "Conference":
base_name = "Others"
if "Multi lingual" in base_name:
base_name = base_name.replace("Multi lingual", "Multilingual")
# Clean up extra whitespace and consecutive whitespace
base_name = re.sub(r'\s+', ' ', base_name).strip()
# Skip if base name is too short
# if len(base_name) < 5:
# base_name = "Unknown"
# Add to conferences dictionary
# if year:
# conferences[base_name].add(int(year))
# else:
# conferences[base_name].add(None)
conferences[base_name].add(line)
abbr2count[base_name] += _count
# conf_abbr2keywords = {
# "ACL": ["Association for Computational Linguistics"],
# "EMNLP": ["Empirical Methods in Natural Language Processing"],
# "NAACL": ["North American Chapter of the Association for Computational Linguistics"],
# "EACL": ["European Chapter of the Association for Computational Linguistics"],
# "COLM": ["Conference on Computational Linguistics"],
# }
# print(f"Found {len(conferences)} unique conferences from {len(lines)} lines")
# for i, conf in enumerate(sorted(conferences.keys())):
# print(f"{i+1}. {conf}")
# if i > 200: break
# import pdb; pdb.set_trace()
conference_to_save = {}
others = []
for i, (conf, count) in enumerate(sorted(abbr2count.items(), key=lambda x: x[1], reverse=True)):
ratio = count / len(all_lines)
if ratio < 0.001 or conf == "Others":
others.append((conf, count))
continue
conference_to_save[conf] = {
"count": count,
"conferences": tuple(conferences[conf]),
}
print(f"{i+1}. {conf}: {count} [{ratio * 100:.1f}%]")
conference_to_save[f"Others ({len(others)} Venues)"] = {
"count": sum(count for conf, count in others),
"conferences": tuple(conf for conf, count in others),
}
# Save to JSON file
with open('data/unique_conferences.json', 'w', encoding='utf-8') as f:
json.dump(conference_to_save, f, indent=2, ensure_ascii=False)
print(f"Extracted {len(conference_to_save)} unique conferences")
print(f"Saved to data/unique_conferences.json")
# Script entry point: run with no arguments; reads data/conferences.txt and
# writes data/unique_conferences.json (paths are hard-coded inside the function).
if __name__ == "__main__":
    extract_conference_info()