|
|
|
|
|
""" |
|
DOCUMENTATION: This script performs vectorization and topic inference using Mallet models. It accepts a raw JSONL file,
identifies the language of each text, and applies the corresponding Mallet model for topic inference. Other input
formats are supported through a flexible InputReader abstraction (e.g., CSV, JSONL).

Unlike the Mallet CLI, this script can handle multiple languages in a single
run without invoking Mallet separately for each language.
|
|
|
Classes:
- Lemmatizer: Tokenizes text with spaCy and looks up lemmas in a
  lemmatization dictionary.
- MalletVectorizer: Handles text-to-Mallet vectorization.
- LanguageInferencer: Performs topic inference using a Mallet inferencer and
  the vectorizer.
- InputReader (abstract class): Defines the interface for reading input
  documents.
- JsonlInputReader: Reads input from JSONL files.
- CsvInputReader: Reads input from CSV files (Mallet format).
- MalletTopicInferencer: Coordinates the process, identifies language, and
  manages inference.
|
|
|
USAGE:
    python mallet_topic_inferencer.py --input input.jsonl --output output.txt
        --logfile logfile.log --input-format jsonl --level INFO
        --languages de en --model_dir models
        --de_inferencer models/de.inferencer --de_pipe models/de.pipe
|
""" |
|
|
|
import argparse
import collections
import csv
import json
import logging
import os
import tempfile
import traceback
from abc import ABC, abstractmethod
from typing import Dict, Generator, List, Optional, Set, Tuple

import jpype
import jpype.imports
import spacy
from smart_open import open

import mallet2topic_assignment_jsonl as m2taj
|
|
|
log = logging.getLogger(__name__) |
|
|
|
|
|
def save_text_as_csv(text: str) -> str: |
|
""" |
|
Save the given text as a temporary CSV file with an arbitrary ID and return the file name. |
|
|
|
Args: |
|
text (str): The text to be saved in the CSV file. |
|
|
|
Returns: |
|
str: The name of the temporary CSV file. |
|
""" |
|
|
|
temp_csv_file = tempfile.NamedTemporaryFile( |
|
delete=False, mode="w", suffix=".csv", newline="", encoding="utf-8" |
|
) |
|
|
|
|
|
csv_writer = csv.writer(temp_csv_file, delimiter="\t") |
|
csv_writer.writerow(["ID", "DUMMYCLASS", "TEXT"]) |
|
csv_writer.writerow(["USERINPUT-2024-10-24-a-i0042", "dummy_class", text]) |
|
|
|
|
|
temp_csv_file.close() |
|
|
|
return temp_csv_file.name |
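
# Usage sketch for save_text_as_csv (illustrative; the input text is made up
# and the returned path is an arbitrary tempfile name):
#
#     csv_path = save_text_as_csv("Ein kurzer Beispieltext.")
#     # csv_path points to a temporary .csv file holding a header row and one
#     # tab-separated data row with the fixed dummy document ID.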
|
|
|
|
|
class Lemmatizer: |
|
def __init__( |
|
self, |
|
languages_dict: Dict[str, str], |
|
lang_lemmatization_dict: Dict[str, Dict[str, str]], |
|
): |
|
""" |
|
Initializes the linguistic lemmatizer with specified languages and lemmatization dictionary. |
|
|
|
Args: |
|
languages (List[str]): List of language codes to load processing pipelines for. |
|
lemmatization_dict (Dict[str, str]): Dictionary mapping tokens to their lemmas. |
|
""" |
|
self.languages_dict = languages_dict |
|
self.lemmatization_dict = lang_lemmatization_dict |
|
self.language_processors = self._load_language_processors(languages_dict) |
|
|
|
def _load_language_processors( |
|
self, languages_dict |
|
) -> Dict[str, spacy.language.Language]: |
|
""" |
|
Loads spacy language processors for the specified languages. |
|
|
|
Returns: |
|
Dict[str, spacy.language.Language]: Dictionary mapping language codes to spacy NLP pipelines. |
|
""" |
|
|
|
processors = {} |
|
for lang in languages_dict: |
|
processors[lang] = spacy.load( |
|
languages_dict[lang], disable=["parser", "ner"] |
|
) |
|
processors[lang].add_pipe("sentencizer") |
|
return processors |
|
|
|
def analyze_text(self, text: str, lang: str) -> List[str]: |
|
""" |
|
Analyzes text, performing tokenization, POS tagging, and lemma mapping. |
|
|
|
Args: |
|
text (str): Text to process. |
|
lang (str): Language code for the text. |
|
|
|
Returns: |
|
List[str]: List of tokens that have matching entries in the lemmatization dictionary. |
|
""" |
|
if lang not in self.language_processors: |
|
raise ValueError(f"No processing pipeline for language '{lang}'") |
|
|
|
nlp = self.language_processors[lang] |
|
doc = nlp(text) |
|
token2lemma = self.lemmatization_dict[lang] |
|
matched_tokens = [ |
|
lemma for tok in doc if (lemma := token2lemma.get(tok.text.lower())) |
|
] |
|
return matched_tokens |
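
# Usage sketch for Lemmatizer (illustrative; the spaCy model name and the tiny
# dictionary are hypothetical stand-ins for real resources):
#
#     lemmatizer = Lemmatizer(
#         languages_dict={"de": "de_core_news_sm"},
#         lang_lemmatization_dict={"de": {"häuser": "haus"}},
#     )
#     lemmatizer.analyze_text("Die Häuser sind alt.", "de")  # -> ["haus"]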
|
|
|
|
|
|
|
|
|
|
|
class MalletVectorizer: |
|
""" |
|
Handles the vectorization of multiple documents into a format usable by Mallet using the pipe file from the model. |
|
""" |
|
|
|
def __init__(self, language: str, pipe_file: str) -> None: |
|
|
|
|
|
from cc.mallet.classify.tui import Csv2Vectors |
|
|
|
self.vectorizer = Csv2Vectors() |
|
|
|
self.pipe_file = pipe_file |
|
self.language = language |
|
|
|
def run_csv2vectors( |
|
self, |
|
input_file: str, |
|
output_file: Optional[str] = None, |
|
delete_input_file_after: bool = True, |
|
) -> str: |
|
""" |
|
Run Csv2Vectors to vectorize the input file. |
|
|
|
Simple java-internal command line interface to the Csv2Vectors class in Mallet. |
|
|
|
        Args:
            input_file: Path to the CSV input file to be vectorized.
            output_file: Path where the output .mallet file should be saved;
                defaults to input_file + ".mallet".
            delete_input_file_after: Whether to delete the input file after
                vectorization (skipped at DEBUG log level).

        Returns:
            Path to the .mallet output file.
        """
|
|
|
if not output_file: |
|
output_file = input_file + ".mallet" |
|
|
|
|
|
arguments = [ |
|
"--input", |
|
input_file, |
|
"--output", |
|
output_file, |
|
"--keep-sequence", |
|
"--encoding", |
|
"UTF-8", |
|
"--use-pipe-from", |
|
self.pipe_file, |
|
] |
|
|
|
logging.info("Calling mallet Csv2Vector: %s", arguments) |
|
self.vectorizer.main(arguments) |
|
|
|
logging.debug("Csv2Vector call finished.") |
|
if log.getEffectiveLevel() != logging.DEBUG and delete_input_file_after: |
|
os.remove(input_file) |
|
logging.info("Cleaning up input file: %s", input_file) |
|
return output_file |
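
# Usage sketch for MalletVectorizer (illustrative; paths are hypothetical):
#
#     vectorizer = MalletVectorizer("de", "models/tm-de-all-v2.0.pipe")
#     mallet_file = vectorizer.run_csv2vectors("batch.de.tsv")
#     # mallet_file == "batch.de.tsv.mallet", vectorized with the model's pipe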
|
|
|
|
|
class LanguageInferencer: |
|
""" |
|
A class to manage Mallet inferencing for a specific language. |
|
Loads the inferencer and pipe file during initialization. |
|
""" |
|
|
|
def __init__(self, language: str, inferencer_file: str, pipe_file: str) -> None: |
|
|
|
|
|
from cc.mallet.topics.tui import InferTopics |
|
|
|
self.language = language |
|
self.inferencer_file = inferencer_file |
|
self.inferencer = InferTopics() |
|
self.pipe_file = pipe_file |
|
self.vectorizer = MalletVectorizer(language=language, pipe_file=self.pipe_file) |
|
|
|
if not os.path.exists(self.inferencer_file): |
|
raise FileNotFoundError( |
|
f"Inferencer file not found: {self.inferencer_file}" |
|
) |
|
|
|
    def run_csv2topics(
        self, csv_file: str, delete_mallet_file_after: bool = True
    ) -> str:
        """
        Perform topic inference on a single input file.
        The input file should be in the format expected by Mallet.
        Returns the path to the resulting doc-topics file.
        """
|
|
|
|
|
|
|
mallet_file = self.vectorizer.run_csv2vectors(csv_file) |
|
|
|
topics_file = mallet_file + ".doctopics" |
|
|
|
arguments = [ |
|
"--input", |
|
mallet_file, |
|
"--inferencer", |
|
self.inferencer_file, |
|
"--output-doc-topics", |
|
topics_file, |
|
"--random-seed", |
|
"42", |
|
] |
|
|
|
logging.info("Calling mallet InferTopics: %s", arguments) |
|
|
|
self.inferencer.main(arguments) |
|
logging.debug("InferTopics call finished.") |
|
|
|
        if log.getEffectiveLevel() != logging.DEBUG and delete_mallet_file_after:
            os.remove(mallet_file)
            logging.info("Cleaned up intermediate mallet file: %s", mallet_file)
|
|
|
return topics_file |
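
# Note on the doc-topics file returned by run_csv2topics (a hedged sketch of
# Mallet's --output-doc-topics format; exact columns can vary by Mallet
# version): comment lines start with "#", and each data row is tab-separated
# with a numeric document index first, e.g.
#
#     0<TAB>USERINPUT-2024-10-24-a-i0042<TAB>0.0123<TAB>0.0456<TAB>...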
|
|
|
|
|
|
|
|
|
|
|
class InputReader(ABC): |
|
""" |
|
Abstract base class for input readers. |
|
Subclasses should implement the `read_documents` method to yield documents. |
|
""" |
|
|
|
@abstractmethod |
|
def read_documents(self) -> Generator[Tuple[str, str], None, None]: |
|
""" |
|
Yields a tuple of (document_id, text). |
|
Each implementation should handle its specific input format. |
|
""" |
|
pass |
|
|
|
|
|
class JsonlInputReader(InputReader): |
|
""" |
|
Reads input from a JSONL file, where each line contains a JSON object |
|
with at least "id" and "text" fields. |
|
""" |
|
|
|
def __init__(self, input_file: str) -> None: |
|
self.input_file = input_file |
|
|
|
def read_documents(self) -> Generator[Tuple[str, str], None, None]: |
|
with open(self.input_file, "r", encoding="utf-8") as f: |
|
for line in f: |
|
data = json.loads(line) |
|
document_id = data.get("id", "unknown_id") |
|
text = data.get("text", "") |
|
yield document_id, text |
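
# Example of an input line accepted by JsonlInputReader (illustrative):
#
#     {"id": "doc-001", "text": "Ein kurzer Beispieltext."}
#
# Missing fields fall back to "unknown_id" and the empty string, respectively.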
|
|
|
|
|
class CsvInputReader(InputReader): |
|
""" |
|
Reads input from a CSV file in Mallet's format (document ID, dummy class, text). |
|
Assumes that the CSV has three columns: "id", "dummyclass", and "text". |
|
""" |
|
|
|
def __init__(self, input_file: str) -> None: |
|
self.input_file = input_file |
|
|
|
def read_documents(self) -> Generator[Tuple[str, str], None, None]: |
|
with open(self.input_file, mode="r", encoding="utf-8") as f: |
|
csv_reader = csv.reader(f, delimiter="\t") |
|
for row in csv_reader: |
|
if len(row) < 3: |
|
continue |
|
document_id, text = row[0], row[2] |
|
yield document_id, text.lower() |
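
# Example of an input row accepted by CsvInputReader (illustrative; columns are
# tab-separated and the dummy-class column is ignored):
#
#     doc-001<TAB>dummy_class<TAB>Ein kurzer Beispieltext.
#
# Rows with fewer than three columns are skipped; the text is lowercased.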
|
|
|
|
|
|
|
|
|
|
|
class MalletTopicInferencer: |
|
""" |
|
MalletTopicInferencer class coordinates the process of reading input documents, identifying their language, and performing topic inference using Mallet models. |
|
""" |
|
|
|
def __init__(self, args: argparse.Namespace) -> None: |
|
self.args = args |
|
self.languages = set(args.languages) |
|
self.language_inferencers: Optional[Dict[str, LanguageInferencer]] = None |
|
self.language_lemmatizations: Optional[Dict[str, Dict[str, str]]] = None |
|
self.input_reader = None |
|
self.inference_results: List[Dict[str, str]] = [] |
|
self.language_dict: Dict[str, str] = {} |
|
self.seen_languages: Set[str] = set() |
|
self.stats = collections.Counter() |
|
|
|
def initialize(self) -> None: |
|
"""Initialize the inferencers after JVM startup.""" |
|
self.language_inferencers = self.init_language_inferencers(self.args) |
|
self.input_reader = self.build_input_reader(self.args) |
|
self.language_lemmatizations = self.init_language_lemmatizations(self.args) |
|
if self.args.language_file: |
|
self.language_dict = self.read_language_file(self.args.language_file) |
|
|
|
@staticmethod |
|
def start_jvm() -> None: |
|
"""Start the Java Virtual Machine if not already started.""" |
|
if not jpype.isJVMStarted(): |
|
current_dir = os.getcwd() |
|
source_dir = os.path.dirname(os.path.abspath(__file__)) |
|
|
|
|
|
classpath = [ |
|
os.path.join(current_dir, "mallet/lib/mallet-deps.jar"), |
|
os.path.join(current_dir, "mallet/lib/mallet.jar"), |
|
] |
|
|
|
|
|
if not all(os.path.exists(path) for path in classpath): |
|
|
|
classpath = [ |
|
os.path.join(source_dir, "mallet/lib/mallet-deps.jar"), |
|
os.path.join(source_dir, "mallet/lib/mallet.jar"), |
|
] |
|
|
|
jpype.startJVM(classpath=classpath) |
|
log.info(f"JVM started successfully with classpath {classpath}.") |
|
else: |
|
log.warning("JVM already running.") |
|
|
|
def read_language_file(self, language_file: str) -> Dict[str, str]: |
|
"""Read the language file (JSONL) and return a dictionary of document_id -> language.""" |
|
|
|
language_dict = {} |
|
with open(language_file, "r", encoding="utf-8") as f: |
|
for line in f: |
|
data = json.loads(line) |
|
doc_id = data.get("doc_id") |
|
language = data.get("language") |
|
if doc_id and language: |
|
language_dict[doc_id] = language |
|
return language_dict |
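
    # Example of a line in the language file (illustrative):
    #
    #     {"doc_id": "doc-001", "language": "de"}
    #
    # Entries missing either field are ignored.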
|
|
|
@staticmethod |
|
def load_lemmatization_file( |
|
lemmatization_file_path: str, |
|
bidi: bool = False, |
|
lowercase: bool = True, |
|
ignore_pos: bool = True, |
|
) -> Dict[str, str]: |
|
""" |
|
Load lemmatization data from the file. |
|
:param lemmatization_file_path: Path to the lemmatization file. |
|
:return: A dictionary mapping tokens to their corresponding lemmas. |
|
""" |
|
|
|
token2lemma = {} |
|
n = 0 |
|
        with open(lemmatization_file_path, "r", encoding="utf-8") as file:
|
for line in file: |
|
token, _, lemma = line.strip().split("\t") |
|
if lowercase: |
|
token2lemma[token.lower()] = lemma.lower() |
|
else: |
|
token2lemma[token] = lemma |
|
n += 1 |
|
|
|
logging.info( |
|
"Read %d lemmatization entries from %s", n, lemmatization_file_path |
|
) |
|
return token2lemma |
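
    # Example of a lemmatization file line (illustrative; three tab-separated
    # columns, the middle one presumably a POS tag, which is ignored here):
    #
    #     Häuser<TAB>NOUN<TAB>Haus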
|
|
|
def init_language_lemmatizations( |
|
self, args: argparse.Namespace |
|
) -> Dict[str, Dict[str, str]]: |
|
"""Build a mapping of languages to their respective lemmatization dictionaries.""" |
|
|
|
language_lemmatizations: Dict[str, Dict[str, str]] = {} |
|
for language in args.languages: |
|
lemmatization_key = f"{language}_lemmatization" |
|
if getattr(args, lemmatization_key, None): |
|
lemmatization_file = getattr(args, lemmatization_key) |
|
language_lemmatizations[language] = self.load_lemmatization_file( |
|
lemmatization_file |
|
) |
|
else: |
|
log.info( |
|
f"Lemmatization file for language: {language} not provided by" |
|
" arguments. Skipping." |
|
) |
|
return language_lemmatizations |
|
|
|
    def identify_language(self, document_id: str, text: str) -> str:
        """Identify the language of a document from the language file, falling back to German ("de")."""
|
|
|
if document_id in self.language_dict: |
|
return self.language_dict[document_id] |
|
|
|
return "de" |
|
|
|
def init_language_inferencers( |
|
self, args: argparse.Namespace |
|
) -> Dict[str, LanguageInferencer]: |
|
"""Build a mapping of languages to their respective inferencers |
|
|
|
Includes the vectorizer pipe for each language as well. |
|
""" |
|
|
|
language_inferencers: Dict[str, LanguageInferencer] = {} |
|
for language in args.languages: |
|
inferencer_key = f"{language}_inferencer" |
|
pipe_key = f"{language}_pipe" |
|
if getattr(args, inferencer_key, None) and getattr(args, pipe_key, None): |
|
language_inferencers[language] = LanguageInferencer( |
|
language=language, |
|
inferencer_file=getattr(args, inferencer_key), |
|
pipe_file=getattr(args, pipe_key), |
|
) |
|
else: |
|
log.info( |
|
f"Inferencer or pipe file for language: {language} not provided by" |
|
" arguments. Skipping." |
|
) |
|
return language_inferencers |
|
|
|
def build_input_reader(self, args: argparse.Namespace) -> InputReader: |
|
"""Select the appropriate input reader based on the input format.""" |
|
if args.input_format == "jsonl": |
|
return JsonlInputReader(args.input) |
|
elif args.input_format == "csv": |
|
return CsvInputReader(args.input) |
|
else: |
|
raise ValueError(f"Unsupported input format: {args.input_format}") |
|
|
|
def process_input_file(self) -> None: |
|
"""Process the input file, identify language, and apply the appropriate Mallet model""" |
|
temp_files_by_language = self.write_language_specific_csv_files() |
|
|
|
doctopics_files = self.run_topic_inference(temp_files_by_language) |
|
        logging.info("Doctopics files by language: %s", doctopics_files)
|
if self.args.output_format == "csv": |
|
self.merge_inference_results(doctopics_files) |
|
elif self.args.output_format == "jsonl": |
|
self.merge_inference_results_jsonl(doctopics_files) |
|
|
|
    def merge_inference_results_jsonl(self, doctopics_files_by_language) -> None:
        """Merge the per-language doc-topics files into a single JSONL output file."""

        m2ta_converters = {}
        for lang, doctopics_file in doctopics_files_by_language.items():
            topic_model_id = self.args.__dict__[f"{lang}_model_id"]
            if "{lang}" in topic_model_id:
                topic_model_id = topic_model_id.format(lang=lang)
            args = [
                "--output",
                "<generator>",
                "--topic_model",
                topic_model_id,
                "--topic_count",
                str(self.args.__dict__[f"{lang}_topic_count"]),
                "--lang",
                lang,
                doctopics_file,
            ]
            m2ta_converters[lang] = m2taj.Mallet2TopicAssignment.main(args)
        with open(self.args.output, "w", encoding="utf-8") as out_f:
            for m2ta_converter in m2ta_converters.values():
                for row in m2ta_converter:
                    self.stats["content_items"] += 1
                    print(
                        json.dumps(row, ensure_ascii=False, separators=(",", ":")),
                        file=out_f,
                    )
|
|
|
def merge_inference_results( |
|
self, doctopics_files_by_language: Dict[str, str] |
|
) -> None: |
|
"""Merge the inference results from multiple languages into a single output file.""" |
|
|
|
logging.info( |
|
"Saving CSV inference results into file %s from multiple languages: %s", |
|
self.args.output, |
|
doctopics_files_by_language, |
|
) |
|
with open(self.args.output, "w", encoding="utf-8") as out_f: |
|
for language, doctopics_file in doctopics_files_by_language.items(): |
|
with open(doctopics_file, "r", encoding="utf-8") as f: |
|
for line in f: |
|
if line.startswith("#"): |
|
continue |
|
doc_id, topic_dist = line.strip().split("\t", 1) |
|
print( |
|
doc_id + "__" + language, |
|
topic_dist, |
|
sep="\t", |
|
end="\n", |
|
file=out_f, |
|
) |
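
    # Example of a merged CSV output line (illustrative): the numeric Mallet
    # document id is suffixed with the language code, followed by the remaining
    # doc-topics columns:
    #
    #     0__de<TAB>USERINPUT-2024-10-24-a-i0042<TAB>0.0123<TAB>...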
|
|
|
def write_language_specific_csv_files(self) -> Dict[str, str]: |
|
"""Read documents and write to language-specific temporary files""" |
|
tsv_files_by_language = {} |
|
|
|
for document_id, text in self.input_reader.read_documents(): |
|
language_code = self.identify_language(document_id, text) |
|
self.stats["LANGUAGE: " + language_code] += 1 |
|
if language_code not in self.languages: |
|
continue |
|
|
|
if language_code not in tsv_files_by_language: |
|
tsv_files_by_language[language_code] = tempfile.NamedTemporaryFile( |
|
delete=False, |
|
mode="w", |
|
suffix=f".{language_code}.tsv", |
|
encoding="utf-8", |
|
) |
|
logging.info( |
|
"Writing documents for language: %s in temp file: %s", |
|
language_code, |
|
tsv_files_by_language[language_code].name, |
|
) |
|
|
|
print( |
|
document_id, |
|
language_code, |
|
text, |
|
sep="\t", |
|
end="\n", |
|
file=tsv_files_by_language[language_code], |
|
) |
|
|
|
|
|
for temp_file in tsv_files_by_language.values(): |
|
temp_file.close() |
|
|
|
|
|
result = { |
|
lang: temp_file.name for lang, temp_file in tsv_files_by_language.items() |
|
} |
|
return result |
|
|
|
def run_topic_inference( |
|
self, language_specific_csv_files: Dict[str, str] |
|
) -> Dict[str, str]: |
|
"""Run inference for each language""" |
|
doctopics_files_by_language = {} |
|
for language_code, csv_file in language_specific_csv_files.items(): |
|
inferencer = self.language_inferencers.get(language_code) |
|
if not inferencer: |
|
log.error(f"No inferencer found for language: {language_code}") |
|
continue |
|
|
|
doctopics_file = inferencer.run_csv2topics(csv_file) |
|
doctopics_files_by_language[language_code] = doctopics_file |
|
|
|
|
|
            if log.getEffectiveLevel() != logging.DEBUG:
                logging.info("Cleaning up language-specific csv file: %s", csv_file)
                os.remove(csv_file)
|
logging.debug("Resulting doctopic files: %s", doctopics_files_by_language) |
|
return doctopics_files_by_language |
|
|
|
def write_results_to_output(self) -> None: |
|
"""Write the final merged inference results to the output file.""" |
|
with open(self.args.output, "w", encoding="utf-8") as out_file: |
|
for result in self.inference_results: |
|
out_file.write(json.dumps(result) + "\n") |
|
log.info(f"All inferences merged and written to {self.args.output}") |
|
|
|
def run(self) -> None: |
|
"""Main execution method.""" |
|
try: |
|
self.start_jvm() |
|
self.initialize() |
|
self.process_input_file() |
|
|
|
except Exception as e: |
|
log.error(f"An error occurred: {e}") |
|
log.error("Traceback: %s", traceback.format_exc()) |
|
finally: |
|
jpype.shutdownJVM() |
|
log.info("JVM shutdown.") |
|
for key, value in sorted(self.stats.items()): |
|
log.info(f"STATS: {key}: {value}") |
|
|
|
|
|
if __name__ == "__main__": |
|
languages = ["de", "fr", "lb"] |
|
parser = argparse.ArgumentParser(description="Mallet Topic Inference in Python") |
|
|
|
parser.add_argument("--logfile", help="Path to log file", default=None) |
|
parser.add_argument( |
|
"--level", |
|
default="DEBUG", |
|
choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], |
|
help="Logging level $(default)s", |
|
) |
|
parser.add_argument( |
|
"--input", help="Path to input file (%(default)s)", required=True |
|
) |
|
parser.add_argument( |
|
"--input-format", |
|
choices=["jsonl", "csv"], |
|
default="jsonl", |
|
help="Format of the input file", |
|
) |
|
parser.add_argument( |
|
"--output-format", |
|
choices=["jsonl", "csv"], |
|
help=( |
|
"Format of the output file: csv: raw Mallet output with docids patched into" |
|
" numericID-LANG, jsonl: impresso JSONL format" |
|
), |
|
) |
|
parser.add_argument( |
|
"--output", |
|
help="Path to final output file. (%(default)s)", |
|
default="out.jsonl", |
|
) |
|
parser.add_argument( |
|
"--languages", |
|
nargs="+", |
|
default=languages, |
|
help="List of languages to support (%(default)s)", |
|
) |
|
|
|
parser.add_argument( |
|
"--language-file", |
|
help="Path to JSONL containing document_id to language mappings", |
|
required=False, |
|
) |
|
parser.add_argument("--model_dir", help="Path to model directory", required=True) |
|
|
|
for lang in languages: |
|
parser.add_argument( |
|
f"--{lang}_inferencer", |
|
help=f"Path to {lang} inferencer file", |
|
) |
|
parser.add_argument(f"--{lang}_pipe", help=f"Path to {lang} pipe file") |
|
parser.add_argument( |
|
f"--{lang}_lemmatization", help=f"Path to {lang} lemmatization file" |
|
) |
|
|
|
for lang in languages: |
|
parser.add_argument( |
|
f"--{lang}_model_id", |
|
default=f"tm-{lang}-all-v2.0", |
|
help="Model ID can take a {lang} format placeholder (%(default)s)", |
|
) |
|
for lang in languages: |
|
parser.add_argument( |
|
f"--{lang}_topic_count", |
|
default=100, |
|
help="Number of topics of model (%(default)s)", |
|
) |
|
args = parser.parse_args() |
|
|
|
logging.basicConfig( |
|
filename=args.logfile, |
|
level=args.level, |
|
format="%(asctime)-15s %(filename)s:%(lineno)d %(levelname)s: %(message)s", |
|
force=True, |
|
) |
|
|
|
for lang in args.languages: |
|
model_id = getattr(args, f"{lang}_model_id") |
|
model_dir = args.model_dir |
|
|
|
pipe_path = os.path.join(model_dir, f"{model_id}.pipe") |
|
inferencer_path = os.path.join(model_dir, f"{model_id}.inferencer") |
|
lemmatization_path = os.path.join( |
|
model_dir, f"{model_id}.vocab.lemmatization.tsv.gz" |
|
) |
|
|
|
if not getattr(args, f"{lang}_pipe") and os.path.exists(pipe_path): |
|
logging.info("Automatically setting pipe path to %s", pipe_path) |
|
setattr(args, f"{lang}_pipe", pipe_path) |
|
if not getattr(args, f"{lang}_inferencer") and os.path.exists(inferencer_path): |
|
logging.info("Automatically setting inferencer path to %s", inferencer_path) |
|
setattr(args, f"{lang}_inferencer", inferencer_path) |
|
if not getattr(args, f"{lang}_lemmatization") and os.path.exists( |
|
lemmatization_path |
|
): |
|
logging.info( |
|
"Automatically setting lemmatization path to %s", lemmatization_path |
|
) |
|
setattr(args, f"{lang}_lemmatization", lemmatization_path) |
|
|
|
if not args.output_format: |
|
if "jsonl" in args.output: |
|
args.output_format = "jsonl" |
|
else: |
|
args.output_format = "csv" |
|
logging.warning("Unspecified output format set to %s", args.output_format) |
|
    for lang in list(args.languages):
        if not getattr(args, f"{lang}_inferencer") or not getattr(args, f"{lang}_pipe"):
|
logging.warning( |
|
"Inferencer or pipe file not provided for language: %s. Ignoring" |
|
" content items for this language.", |
|
lang, |
|
) |
|
args.languages.remove(lang) |
|
logging.info( |
|
"Performing monolingual topic inference for the following languages: %s", |
|
args.languages, |
|
) |
|
|
|
logging.info("Arguments: %s", args) |
|
app = MalletTopicInferencer(args) |
|
app.run() |
|
|