package edu.emory.mathcs.nlp.component.tokenizer;

import edu.emory.mathcs.nlp.common.util.Language;
import edu.emory.mathcs.nlp.common.util.PatternUtils;
import edu.emory.mathcs.nlp.common.util.StringUtils;
import edu.emory.mathcs.nlp.component.tokenizer.dictionary.Abbreviation;
import edu.emory.mathcs.nlp.component.tokenizer.dictionary.Compound;
import edu.emory.mathcs.nlp.component.tokenizer.dictionary.EnglishApostrophe;
import edu.emory.mathcs.nlp.component.tokenizer.dictionary.EnglishHyphen;
import edu.emory.mathcs.nlp.component.tokenizer.token.Token;
import edu.emory.mathcs.nlp.component.tokenizer.token.TokenIndex;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.math3.geometry.VectorFormat;

/* loaded from: input_file:edu/emory/mathcs/nlp/component/tokenizer/EnglishTokenizer.class */
/**
 * English-specific {@link Tokenizer}.
 *
 * <p>Implements the tokenizer's overridable steps with the English apostrophe, abbreviation,
 * compound and hyphen dictionaries, and splits a token list into segments that end at
 * final-mark tokens, keeping a closing quote or bracket attached to the segment it closes.
 */
public class EnglishTokenizer extends Tokenizer {
    /**
     * Opening symbols tracked while segmenting. Index 0 is the double quote, which is also its
     * own closing symbol. Plain literals are used for the braces instead of commons-math's
     * {@code VectorFormat.DEFAULT_PREFIX}/{@code DEFAULT_SUFFIX} (same values) so this class
     * does not need that library to compile.
     */
    private final String[] L_BRACKETS = {"\"", "(", "{", "["};
    /** Closing symbols, index-aligned with {@link #L_BRACKETS}. */
    private final String[] R_BRACKETS = {"\"", ")", "}", "]"};
    /**
     * Two or more letters, a run of {@code . ! ?}, then two or more letters.
     * NOTE(review): {@link #tokenizeMiddleSymbol} reads groups 2-4, so
     * {@code createClosedPattern} presumably wraps the expression in one extra outer group —
     * confirm against {@code PatternUtils}.
     */
    private final Pattern P_MID_SYM = PatternUtils.createClosedPattern("(\\p{Alpha}{2,}+)([\\.\\!\\?]+)(\\p{Alpha}{2,}+)");
    private EnglishApostrophe d_apostrophe = new EnglishApostrophe();
    private Abbreviation d_abbreviation = new Abbreviation();
    private Compound d_compound = new Compound(Language.ENGLISH);
    private EnglishHyphen d_hyphen = new EnglishHyphen();

    /** Always returns 0: this tokenizer applies no adjustment here. */
    @Override
    protected int adjustFirstNonSymbolGap(char[] cArr, int i, String str) {
        return 0;
    }

    /**
     * Returns 1 when {@code cArr[i]} is a period and the lower-cased {@code str} is reported by
     * the abbreviation dictionary as an abbreviation ending with a period; otherwise 0.
     */
    @Override
    protected int adjustLastSymbolSequenceGap(char[] cArr, int i, String str) {
        if (cArr[i] != '.') {
            return 0;
        }
        return this.d_abbreviation.isAbbreviationEndingWithPeriod(StringUtils.toLowerCase(str)) ? 1 : 0;
    }

    /** Delegates to the hyphen dictionary's {@code preserveHyphen} for position {@code i}. */
    @Override
    protected boolean preserveSymbolInBetween(char[] cArr, int i) {
        return this.d_hyphen.preserveHyphen(cArr, i);
    }

    /**
     * Tries the apostrophe dictionary first and, only if that returns {@code false}, the
     * compound dictionary.
     *
     * @return {@code true} if either {@code tokenize} call returned {@code true}
     */
    @Override
    protected boolean tokenizeWordsMore(List<Token> list, String str, String str2, char[] cArr, TokenIndex tokenIndex) {
        return tokenize(list, str, str2, cArr, this.d_apostrophe, tokenIndex)
                || tokenize(list, str, str2, cArr, this.d_compound, tokenIndex);
    }

    /**
     * Matches {@code str} against {@link #P_MID_SYM} and, on a match, passes groups 2, 3 and 4
     * to {@code addTokens}.
     *
     * @return the value of {@code addTokens} on a match, otherwise 0
     */
    @Override
    protected int tokenizeMiddleSymbol(List<Token> list, String str, String str2, int i) {
        Matcher matcher = this.P_MID_SYM.matcher(str);
        return matcher.find() ? addTokens(matcher, list, i, 2, 3, 4) : 0;
    }

    /**
     * Splits {@code list} into consecutive segments.
     *
     * <p>A segment ends at a token for which {@code isFinalMarksOnly} holds, unless the next
     * token closes a quote or bracket that is currently open; in that case the end is deferred
     * token by token until the next token is no longer such a closer. Tokens left after the last
     * segment end form a final segment.
     *
     * @param list the tokens to segment, in order
     * @return the segments in order; each one is a {@link List#subList} view of {@code list},
     *         not a copy
     */
    @Override
    public <T extends Token> List<List<T>> segmentize(List<T> list) {
        List<List<T>> segments = new ArrayList<>();
        int[] openCounts = new int[this.R_BRACKETS.length];
        int size = list.size();
        boolean endPending = false; // a segment end was seen but deferred past closing symbols
        int begin = 0;              // index of the first token of the current segment

        for (int idx = 0; idx < size; idx++) {
            String form = list.get(idx).getWordForm();
            countBrackets(form, openCounts);

            if (endPending || isFinalMarksOnly(form)) {
                int next = idx + 1;
                if (next < size && isFollowedByBracket(list.get(next).getWordForm(), openCounts)) {
                    // Keep the upcoming closing quote/bracket inside this segment.
                    endPending = true;
                } else {
                    segments.add(list.subList(begin, next));
                    begin = next;
                    endPending = false;
                }
            }
        }
        if (begin < size) {
            segments.add(list.subList(begin, size));
        }
        return segments;
    }

    /**
     * Copies the tokens at indices {@code i} (inclusive) to {@code i2} (exclusive) into a new
     * array.
     */
    public Token[] getSubArray(List<Token> list, int i, int i2) {
        Token[] tokenArr = new Token[i2 - i];
        for (int src = i, dst = 0; src < i2; src++, dst++) {
            tokenArr[dst] = list.get(src);
        }
        return tokenArr;
    }

    /**
     * Updates the open-symbol counters in {@code iArr} for the token {@code str}.
     *
     * <p>A double quote toggles counter 0 between 0 and 1. An opening bracket increments its
     * counter; a closing bracket decrements it, but never below zero, so that an unmatched
     * closing bracket cannot cancel a bracket that is opened later.
     */
    private void countBrackets(String str, int[] iArr) {
        if (str.equals("\"")) {
            iArr[0] += (iArr[0] == 0) ? 1 : -1;
            return;
        }
        // Index 0 is the quote handled above, so the bracket pairs start at 1.
        for (int i = 1; i < iArr.length; i++) {
            if (str.equals(this.L_BRACKETS[i])) {
                iArr[i]++;
            } else if (str.equals(this.R_BRACKETS[i]) && iArr[i] > 0) {
                iArr[i]--;
            }
        }
    }

    /**
     * Returns {@code true} if {@code str} is the closing symbol of a quote or bracket whose
     * counter in {@code iArr} is positive.
     */
    private boolean isFollowedByBracket(String str, int[] iArr) {
        for (int i = 0; i < this.R_BRACKETS.length; i++) {
            if (iArr[i] > 0 && str.equals(this.R_BRACKETS[i])) {
                return true;
            }
        }
        return false;
    }
}
