package marmot.tokenize.openlp;

import edu.emory.mathcs.nlp.common.constant.StringConst;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.regex.Pattern;
import marmot.tokenize.Tokenizer;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.tokenize.TokenSampleStream;
import opennlp.tools.tokenize.TokenizerFactory;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;

/* loaded from: input_file:marmot/tokenize/openlp/OpenNlpTokenizerTrainer.class */
/**
 * Trains an OpenNLP tokenizer model from a sample file and wraps the result
 * in an {@link OpenNlpTokenizer}.
 */
public class OpenNlpTokenizerTrainer {
    /** Value supplied for the OpenNLP {@code "Cutoff"} training parameter. */
    public static final int CUTOFF = 1;

    /**
     * Trains a tokenizer from the file at the given path.
     *
     * <p>The file is decoded as UTF-8 and read line by line through a
     * {@link TokenSampleStream}. Training starts from
     * {@link TrainingParameters#defaultParams()} with {@code "Cutoff"}
     * overridden to {@link #CUTOFF}.
     *
     * @param str path of the training file to read
     * @return a tokenizer backed by the newly trained model
     * @throws IOException if the file cannot be opened or read, or if training
     *     or closing the sample stream fails
     */
    public Tokenizer train(String str) throws IOException {
        TokenSampleStream samples = new TokenSampleStream(
                new PlainTextByLineStream(new FileInputStream(str), Charset.forName("UTF-8")));
        TokenizerModel model;
        // Everything after the stream is opened runs inside the try so the file
        // handle is released exactly once, whichever step fails.
        try {
            // Empty language code, no abbreviation dictionary, alphanumeric
            // optimization enabled, no custom alphanumeric pattern.
            TokenizerFactory factory =
                    new TokenizerFactory(StringConst.EMPTY, (Dictionary) null, true, (Pattern) null);
            TrainingParameters params = TrainingParameters.defaultParams();
            params.put("Cutoff", Integer.toString(CUTOFF));
            model = TokenizerME.train(samples, factory, params);
        } finally {
            samples.close();
        }
        return new OpenNlpTokenizer(model);
    }
}
