package marmot.tokenize.cmd;

import chipmunk.segmenter.SegmenterOptions;
import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import java.io.File;
import java.io.IOException;
import java.io.Writer;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Random;
import lemming.lemma.LemmaOptions;
import marmot.tokenize.RuleBasedTokenizer;
import marmot.tokenize.Tokenizer;
import marmot.tokenize.openlp.OpenNlpConverter;
import marmot.tokenize.openlp.OpenNlpTokenizerTrainer;
import marmot.tokenize.preprocess.Pair;
import marmot.tokenize.preprocess.WikiSelector;
import marmot.tokenize.rules.RuleProvider;
import marmot.util.FileUtils;
import marmot.util.GeneralLevenshteinLattice;
import marmot.util.LevenshteinLattice;

/* loaded from: input_file:marmot/tokenize/cmd/Experimenter.class */
/**
 * Command-line driver that trains OpenNLP-backed, rule-based tokenizers on
 * increasing fractions of a sentence-pair corpus and reports sentence, word
 * and character error counts on a held-out split.
 *
 * <p>All diagnostics and results are written to {@code System.err}.
 */
public class Experimenter {
    /**
     * Parses the command line, loads and shuffles the sentence pairs, splits
     * them 8:1:1 and runs one experiment each with 1%, 10% and 100% of the
     * largest split as training data.
     *
     * <p>Required options are {@code --tokenized-file}, {@code --untokenized-file}
     * and the language option; {@code --num-sentences} (default 1000),
     * {@code --random-seed} (default 42) and the verbosity option (default 0)
     * are optional. On a parse failure the errors, usage and help are printed
     * and the JVM exits with status 1.
     *
     * @param strArr raw command-line arguments
     * @throws IOException if an experiment fails while reading or writing files
     * @throws JSAPException if the option definitions cannot be registered
     */
    public static void main(String[] strArr) throws IOException, JSAPException {
        JSAP jsap = new JSAP();
        jsap.registerParameter(new FlaggedOption("tokenized-file").setRequired(true).setLongFlag("tokenized-file"));
        jsap.registerParameter(new FlaggedOption("untokenized-file").setRequired(true).setLongFlag("untokenized-file"));
        jsap.registerParameter(new FlaggedOption(SegmenterOptions.LANG).setRequired(true).setLongFlag(SegmenterOptions.LANG));
        jsap.registerParameter(new FlaggedOption("num-sentences").setLongFlag("num-sentences").setStringParser(JSAP.INTEGER_PARSER).setDefault("1000"));
        jsap.registerParameter(new FlaggedOption("random-seed").setLongFlag("random-seed").setStringParser(JSAP.INTEGER_PARSER).setDefault("42"));
        jsap.registerParameter(new FlaggedOption(LemmaOptions.VERBOSITY).setLongFlag(LemmaOptions.VERBOSITY).setStringParser(JSAP.INTEGER_PARSER).setDefault("0"));

        JSAPResult parse = jsap.parse(strArr);
        if (!parse.success()) {
            Iterator<?> errors = parse.getErrorMessageIterator();
            while (errors.hasNext()) {
                System.err.println("Error: " + errors.next());
            }
            System.err.println("Usage: ");
            System.err.println(jsap.getUsage());
            System.err.println(jsap.getHelp());
            System.err.println();
            System.exit(1);
        }

        String lang = parse.getString(SegmenterOptions.LANG);
        String tokenizedFile = parse.getString("tokenized-file");
        String untokenizedFile = parse.getString("untokenized-file");
        int numSentences = parse.getInt("num-sentences");
        int verbosity = parse.getInt(LemmaOptions.VERBOSITY);
        Random random = new Random(parse.getInt("random-seed"));

        // Boolean switch handed to WikiSelector; it is enabled for German and
        // Spanish only. Its exact effect is defined by WikiSelector.
        boolean germanOrSpanish = lang.equalsIgnoreCase("de") || lang.equalsIgnoreCase("es");

        List<Pair> pairs = new LinkedList<Pair>();
        Iterator<Pair> selected = new WikiSelector(untokenizedFile, tokenizedFile, germanOrSpanish, numSentences).iterator();
        while (selected.hasNext()) {
            pairs.add(selected.next());
        }
        // Seeded shuffle keeps the split reproducible for a given --random-seed.
        Collections.shuffle(pairs, random);

        List<Pair> trainPairs = new LinkedList<Pair>();
        List<Pair> evalPairs = new LinkedList<Pair>();
        // Collected and passed on, but not read by runExperiment at present.
        List<Pair> reservedPairs = new LinkedList<Pair>();

        // Round-robin over ten slots: slot 0 -> evaluation, slot 1 -> reserved,
        // slots 2..9 -> training.
        int slot = 0;
        for (Pair pair : pairs) {
            if (slot == 0) {
                evalPairs.add(pair);
            } else if (slot == 1) {
                reservedPairs.add(pair);
            } else {
                trainPairs.add(pair);
            }
            slot = (slot + 1) % 10;
        }

        runExperiment(trainPairs, evalPairs, reservedPairs, 1.0d, verbosity, lang);
        runExperiment(trainPairs, evalPairs, reservedPairs, 10.0d, verbosity, lang);
        runExperiment(trainPairs, evalPairs, reservedPairs, 100.0d, verbosity, lang);
    }

    /**
     * Trains a tokenizer on the leading {@code d} percent of {@code list} and
     * evaluates it on {@code list2}.
     *
     * <p>The training pairs are converted into a temporary file (deleted on JVM
     * exit), which is then handed to the OpenNLP trainer. The resulting model is
     * wrapped in a {@link RuleBasedTokenizer} that shares the rule provider used
     * for the conversion.
     *
     * @param list training pairs; only the first {@code d} percent are used
     * @param list2 pairs the trained tokenizer is evaluated on
     * @param list3 additional split; currently not read by this method
     * @param d percentage of {@code list} to train on; a value above 100 makes
     *     {@code subList} throw an {@link IndexOutOfBoundsException}
     * @param i verbosity level forwarded to the converter
     * @param str language code used to select the rule provider and forwarded to
     *     the evaluation
     * @throws IOException if the temporary training file cannot be created,
     *     written or closed
     */
    public static void runExperiment(List<Pair> list, List<Pair> list2, List<Pair> list3, double d, int i, String str) throws IOException {
        int trainSize = (int) ((d * list.size()) / 100.0d);
        System.err.format("Trnsize: %d\n", trainSize);
        List<Pair> trainSubset = list.subList(0, trainSize);

        RuleProvider ruleProvider = RuleProvider.createRuleProvider(str);
        OpenNlpConverter converter = new OpenNlpConverter(ruleProvider);

        File trainFile = File.createTempFile("openlp_file", ".txt");
        trainFile.deleteOnExit();
        String trainPath = trainFile.getAbsolutePath();

        Writer writer = FileUtils.openFileWriter(trainPath);
        try {
            converter.convert(trainSubset, writer, i);
        } finally {
            // Always release the file handle, even when the conversion fails.
            writer.close();
        }

        runEvaluation(str, new RuleBasedTokenizer(new OpenNlpTokenizerTrainer().train(trainPath), ruleProvider), list2);
    }

    /**
     * Tokenizes every pair's untokenized text and prints sentence, word and
     * character error statistics against the pair's reference tokenization.
     *
     * <p>A sentence counts as wrong when the predicted token list differs from
     * the reference split on whitespace. Word errors are the lattice distance
     * between the two token lists; character errors are the lattice distance
     * between the reference string and the predicted tokens joined by single
     * spaces. For Spanish, underscores in the reference are replaced by spaces
     * before comparison. With an empty {@code list} the percentages are printed
     * as {@code NaN}.
     *
     * @param str language code; {@code "es"} (compared case-insensitively, as in
     *     {@code main}) enables the underscore replacement
     * @param tokenizer tokenizer under evaluation
     * @param list pairs to evaluate on
     */
    public static void runEvaluation(String str, Tokenizer tokenizer, List<Pair> list) {
        // Case-insensitive so that e.g. "ES" is treated the same way main treats it.
        boolean spanish = str.equalsIgnoreCase("es");

        int sentenceErrors = 0;
        int charErrors = 0;
        int wordErrors = 0;
        int sentenceCount = 0;
        int charCount = 0;
        int wordCount = 0;

        for (Pair pair : list) {
            List<String> predictedTokens = tokenizer.tokenize(pair.untokenized);

            String reference = pair.tokenized;
            if (spanish) {
                reference = reference.replace('_', ' ');
            }
            List<String> referenceTokens = Arrays.asList(reference.split("\\s+"));

            // Re-join the prediction with single spaces for the character-level comparison.
            StringBuilder joined = new StringBuilder();
            for (String token : predictedTokens) {
                if (joined.length() > 0) {
                    joined.append(' ');
                }
                joined.append(token);
            }
            String predicted = joined.toString();

            if (!predictedTokens.equals(referenceTokens)) {
                sentenceErrors++;
            }
            sentenceCount++;

            charErrors += new LevenshteinLattice(reference, predicted).getDistance();
            charCount += reference.length();

            wordErrors += new GeneralLevenshteinLattice(referenceTokens, predictedTokens).getDistance();
            wordCount += referenceTokens.size();
        }

        System.err.format("Sent Err: %d / %d = %g\n", sentenceErrors, sentenceCount, (sentenceErrors * 100.0d) / sentenceCount);
        System.err.format("Word Err: %d / %d = %g\n", wordErrors, wordCount, (wordErrors * 100.0d) / wordCount);
        System.err.format("Char Err: %d / %d = %g\n", charErrors, charCount, (charErrors * 100.0d) / charCount);
    }
}
