package edu.emory.mathcs.nlp.component.tokenizer;

import edu.emory.mathcs.nlp.common.constant.StringConst;
import edu.emory.mathcs.nlp.common.util.CharUtils;
import edu.emory.mathcs.nlp.common.util.IOUtils;
import edu.emory.mathcs.nlp.common.util.Joiner;
import edu.emory.mathcs.nlp.common.util.MetaUtils;
import edu.emory.mathcs.nlp.common.util.PatternUtils;
import edu.emory.mathcs.nlp.common.util.StringUtils;
import edu.emory.mathcs.nlp.component.tokenizer.dictionary.Currency;
import edu.emory.mathcs.nlp.component.tokenizer.dictionary.Dictionary;
import edu.emory.mathcs.nlp.component.tokenizer.dictionary.Emoticon;
import edu.emory.mathcs.nlp.component.tokenizer.dictionary.Unit;
import edu.emory.mathcs.nlp.component.tokenizer.token.Token;
import edu.emory.mathcs.nlp.component.tokenizer.token.TokenIndex;
import it.unimi.dsi.fastutil.chars.CharOpenHashSet;
import it.unimi.dsi.fastutil.chars.CharSet;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.magicwerk.brownies.collections.GapList;

/* loaded from: input_file:edu/emory/mathcs/nlp/component/tokenizer/Tokenizer.class */
public abstract class Tokenizer {
    /**
     * Extra characters that {@code isSymbolInBetween} accepts, on top of the brackets, arrows,
     * double quotation marks and hyphens it checks through {@code CharUtils}.
     */
    protected final CharSet S_SYMBOL_IN_BETWEEN = new CharOpenHashSet(new char[]{';', ',', '~', '=', '+', '&', '|', '/'});
    /**
     * Pattern tested by {@code preservePeriod} to decide whether a trailing period belongs to the preceding text.
     * NOTE(review): inside the character class {@code [\.|-]} the {@code |} is a literal character, so the class
     * accepts '.', '|' and '-'; confirm that '|' is intended.
     */
    protected final Pattern P_ABBREVIATION = PatternUtils.createClosedPattern("\\p{Alnum}([\\.|-]\\p{Alnum})*");
    /** Pattern tested by {@code adjustFirstNonSymbolIndex} when the symbol just before the text is an apostrophe. */
    protected final Pattern P_YEAR = PatternUtils.createClosedPattern("\\d\\d['’]?[sS]?");
    /**
     * Pattern tested by {@code tokenizeYears}, which then reads groups 2, 3 and 4 of the match.
     * NOTE(review): the expression itself declares only three groups, so {@code createClosedPattern}
     * presumably wraps it in one more capturing group; confirm against {@code PatternUtils}.
     */
    protected final Pattern P_YEAR_YEAR = PatternUtils.createClosedPattern("(\\d{2}|\\d{4})(-)(\\d{2}|\\d{4}|\\d{2}[sS])");
    /** Emoticon dictionary queried by {@code getMetaRange}. */
    protected Emoticon d_emoticon = new Emoticon();
    /** Currency dictionary used by {@code adjustLastSymbolSequenceIndex} and {@code addMorphemes}. */
    protected Currency d_currency = new Currency();
    /** Unit dictionary used by {@code addMorphemes}. */
    protected Unit d_unit = new Unit();
    /** Strings that {@code getMetaRange} keeps as a single token; loaded once by {@code initPreserve}. */
    protected Set<String> d_preserve = initPreserve();

    /**
     * Loads the entries of the {@code preserve.txt} dictionary resource, one trimmed line per entry.
     *
     * <p>Loading is best-effort: if reading fails, the stack trace is printed and the entries read
     * so far are returned. The read loop sits inside the {@code try} so that a failing reader ends
     * the loop instead of being retried forever, and the reader is always closed.
     *
     * @return a mutable set of the trimmed lines read from the resource
     */
    private Set<String> initPreserve() {
        Set<String> preserved = new HashSet<>();
        try (BufferedReader reader = IOUtils.createBufferedReader(IOUtils.getInputStreamsFromResource(Dictionary.ROOT + "preserve.txt"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                preserved.add(line.trim());
            }
        } catch (IOException e) {
            // Deliberately non-fatal: the tokenizer still works with a partial or empty preserve list.
            e.printStackTrace();
        }
        return preserved;
    }

    /**
     * Tokenizes every line of the given stream and returns all tokens in reading order.
     *
     * <p>Each line is passed to {@code tokenizeWhiteSpaces} together with the offset at which the
     * line starts. That offset is derived from the lengths of the previous lines plus the length of
     * the platform {@code line.separator}; offsets therefore assume the stream uses a terminator of
     * that length.
     *
     * <p>The reader (and with it the stream) is closed when this method returns. An
     * {@link IOException} is reported with a stack trace and ends reading; the tokens collected up
     * to that point are returned.
     *
     * @param inputStream the text to tokenize
     * @return the tokens of all lines that were read
     */
    public List<Token> tokenize(InputStream inputStream) {
        ArrayList<Token> tokens = new ArrayList<>();
        // Loop-invariant: look the separator up once instead of once per line.
        int separatorLength = System.getProperty("line.separator").length();

        try (BufferedReader reader = IOUtils.createBufferedReader(inputStream)) {
            boolean firstLine = true;
            int lineEnd = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                // The first line starts at 0; every later line starts one separator after the previous end.
                int lineStart = firstLine ? 0 : lineEnd + separatorLength;
                firstLine = false;
                lineEnd = lineStart + line.length();
                tokens.addAll(tokenizeWhiteSpaces(line, lineStart));
            }
        } catch (IOException e) {
            // Best-effort: keep what was tokenized so far rather than retrying a broken reader.
            e.printStackTrace();
        }

        tokens.trimToSize();
        return tokens;
    }

    /**
     * Tokenizes a single string, with token offsets counted from 0.
     *
     * @param str the text to tokenize
     * @return the tokens produced by {@code tokenizeWhiteSpaces}
     */
    public List<Token> tokenize(String str) {
        final int startOffset = 0;
        return tokenizeWhiteSpaces(str, startOffset);
    }

    /**
     * Tokenizes the stream and hands the tokens to the abstract {@code segmentize(List)}.
     *
     * @param inputStream the text to tokenize and segment
     * @return the segments produced by the subclass
     */
    public List<List<Token>> segmentize(InputStream inputStream) {
        List<Token> tokens = tokenize(inputStream);
        return segmentize(tokens);
    }

    /**
     * Tokenizes the string and hands the tokens to the abstract {@code segmentize(List)}.
     *
     * @param str the text to tokenize and segment
     * @return the segments produced by the subclass
     */
    public List<List<Token>> segmentize(String str) {
        List<Token> tokens = tokenize(str);
        return segmentize(tokens);
    }

    /**
     * Tokenizes the stream line by line and prints one joined line of tokens per input line.
     *
     * <p>When {@code i} is positive, every token's word form is written back before printing:
     * {@code 1} replaces it with {@code StringUtils.toSimplifiedForm(form, false)}, {@code 2} with
     * {@code StringUtils.toSimplifiedForm(form, true)}, and any other positive value writes the
     * unchanged form back. Reading stops at the first {@link IOException}, which is reported with a
     * stack trace. The reader is not closed by this method.
     *
     * @param inputStream the text to tokenize
     * @param printStream where the joined token lines are written
     * @param str         the delimiter passed to {@code Joiner.join}
     * @param i           the word-form rewriting flag described above
     */
    public void tokenizeLine(InputStream inputStream, PrintStream printStream, String str, int i) {
        BufferedReader reader = IOUtils.createBufferedReader(inputStream);
        try {
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                List<Token> tokens = tokenize(line);
                if (i > 0) {
                    for (Token token : tokens) {
                        String form = token.getWordForm();
                        if (i == 1) {
                            form = StringUtils.toSimplifiedForm(form, false);
                        } else if (i == 2) {
                            form = StringUtils.toSimplifiedForm(form, true);
                        }
                        token.setWordForm(form);
                    }
                }
                printStream.println(Joiner.join(tokens, str));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Splits a token list into segments. The stream and string overloads of {@code segmentize} in
     * this class tokenize their input and then delegate here; how tokens are grouped is left
     * entirely to the subclass.
     *
     * @param list the tokens to group
     * @return the groups of tokens produced by the subclass
     */
    public abstract <T extends Token> List<List<T>> segmentize(List<T> list);

    /**
     * Splits {@code str} at white space, tokenizes every non-empty chunk with
     * {@code tokenizeMetaInfo}, and then runs the {@code finalize} pass over the collected tokens.
     *
     * <p>Positions are tracked as absolute offsets: character {@code k} of {@code str} has offset
     * {@code i + k}. Both calls to {@code tokenizeMetaInfo} pass the chunk's absolute begin and end
     * offsets (the trailing chunk ends at {@code i + str.length()}).
     *
     * @param str the text of one line
     * @param i   the offset of the first character of {@code str}
     * @return the tokens of {@code str}; empty when it holds only white space
     */
    private List<Token> tokenizeWhiteSpaces(String str, int i) {
        List<Token> tokens = new GapList<>();
        char[] chars = str.toCharArray();
        int endOffset = i + chars.length;
        int chunkStart = i;

        for (int offset = i; offset < endOffset; offset++) {
            if (CharUtils.isWhiteSpace(chars[offset - i])) {
                if (chunkStart < offset) {
                    tokenizeMetaInfo(tokens, str.substring(chunkStart - i, offset - i), chunkStart, offset);
                }
                // The next chunk can start right after this white-space character at the earliest.
                chunkStart = offset + 1;
            }
        }

        // Trailing chunk that is not followed by white space.
        if (chunkStart < endOffset) {
            tokenizeMetaInfo(tokens, str.substring(chunkStart - i), chunkStart, endOffset);
        }

        if (!tokens.isEmpty()) {
            finalize(tokens, str);
        }
        return tokens;
    }

    /**
     * Tokenizes one white-space-free chunk. If {@code getMetaRange} finds a range inside the chunk,
     * that range becomes a single token and the text before and after it goes through
     * {@code tokenizeSymbols}; otherwise the whole chunk goes through {@code tokenizeSymbols}.
     *
     * @param list receives the tokens
     * @param str  the chunk text
     * @param i    the offset of the chunk's first character
     * @param i2   not read by this method
     */
    private void tokenizeMetaInfo(List<Token> list, String str, int i, int i2) {
        TokenIndex cursor = new TokenIndex(i);
        int[] range = getMetaRange(str);

        if (range != null) {
            int metaBegin = range[0];
            int metaEnd = range[1];

            if (metaBegin > 0) {
                tokenizeSymbols(list, str.substring(0, metaBegin), cursor);
            }

            int tokenBegin = cursor.getVal();
            int tokenEnd = tokenBegin + metaEnd - metaBegin;
            list.add(new Token(str.substring(metaBegin, metaEnd), tokenBegin, tokenEnd));
            cursor.setVal(tokenEnd);

            if (metaEnd < str.length()) {
                tokenizeSymbols(list, str.substring(metaEnd), cursor);
            }
        } else {
            tokenizeSymbols(list, str, cursor);
        }
    }

    /**
     * Finds the part of a chunk that must stay a single token.
     *
     * <p>Checked in this order: the whole chunk when it starts with a network protocol or is listed
     * in {@code d_preserve}; the range reported by the emoticon dictionary; the first match of
     * {@code MetaUtils.HYPERLINK}.
     *
     * @param str the chunk text
     * @return a two-element {@code {begin, end}} array, or {@code null} when nothing applies
     */
    private int[] getMetaRange(String str) {
        boolean wholeChunk = MetaUtils.startsWithNetworkProtocol(str) || this.d_preserve.contains(str);
        if (wholeChunk) {
            return new int[]{0, str.length()};
        }

        int[] emoticon = this.d_emoticon.getEmoticonRange(str);
        if (emoticon != null) {
            return emoticon;
        }

        Matcher link = MetaUtils.HYPERLINK.matcher(str);
        return link.find() ? new int[]{link.start(), link.end()} : null;
    }

    /**
     * Tokenizes a piece of text around its symbols. A piece made only of symbols goes straight to
     * {@code addSymbols}. Otherwise a list of {@code {begin, end}} symbol spans is built (leading
     * symbols, spans found by {@code addNextSymbolSequenceIndices}, trailing symbols) and passed to
     * {@code tokenizeSymbolsAux}.
     *
     * @param list       receives the tokens
     * @param str        the text to tokenize
     * @param tokenIndex the running offset, advanced as tokens are added
     */
    private void tokenizeSymbols(List<Token> list, String str, TokenIndex tokenIndex) {
        char[] chars = str.toCharArray();
        int total = str.length();
        int leadingEnd = getFirstNonSymbolIndex(chars);

        if (leadingEnd != total) {
            int trailingBegin = getLastSymbolSequenceIndex(chars);
            List<int[]> spans = new ArrayList<>();
            spans.add(new int[]{0, leadingEnd});
            addNextSymbolSequenceIndices(spans, chars, leadingEnd + 1, trailingBegin - 1);
            spans.add(new int[]{trailingBegin, total});
            tokenizeSymbolsAux(list, str, chars, spans, tokenIndex);
        } else {
            addSymbols(list, str, tokenIndex);
        }
    }

    /**
     * @param cArr the characters to scan
     * @return the index of the first character for which {@code isSymbol} is false, or
     *         {@code cArr.length} when every character is a symbol
     */
    private int getFirstNonSymbolIndex(char[] cArr) {
        for (int pos = 0; pos < cArr.length; pos++) {
            if (!isSymbol(cArr[pos])) {
                return pos;
            }
        }
        return cArr.length;
    }

    /**
     * @param cArr the characters to scan
     * @return the index at which the trailing run of symbols begins: {@code cArr.length} when the
     *         last character is not a symbol, {@code 0} when every character is a symbol
     */
    private int getLastSymbolSequenceIndex(char[] cArr) {
        for (int end = cArr.length; end > 0; end--) {
            if (!isSymbol(cArr[end - 1])) {
                return end;
            }
        }
        return 0;
    }

    /**
     * Scans {@code cArr[i .. i2)} for symbol spans and appends each as a {@code {begin, end}} pair.
     *
     * <p>A position is skipped when any of the three {@code preserve...} checks accepts it. Of the
     * remaining positions, a span starts where {@code isEllipsis} holds, where the character is an
     * in-between symbol, or where a final mark is directly followed (still before {@code i2}) by an
     * in-between symbol. The span ends where {@code getSpanIndex} says, and scanning resumes there.
     *
     * @param list receives the spans
     * @param cArr the characters of the text
     * @param i    first index to examine
     * @param i2   index at which scanning stops (exclusive)
     */
    private void addNextSymbolSequenceIndices(List<int[]> list, char[] cArr, int i, int i2) {
        for (int pos = i; pos < i2; pos++) {
            if (preserveSymbolInBetween(cArr, pos) || preserveSymbolInDigits(cArr, pos) || preserveSymbolInAlphabets(cArr, pos)) {
                continue;
            }

            boolean startsSpan = isEllipsis(cArr, pos)
                    || isSymbolInBetween(cArr[pos])
                    || (pos + 1 < i2 && isSymbolInBetween(cArr[pos + 1]) && CharUtils.isFinalMark(cArr[pos]));
            if (!startsSpan) {
                continue;
            }

            int spanEnd = getSpanIndex(cArr, pos, i2, false);
            list.add(new int[]{pos, spanEnd});
            // The loop increment moves on to spanEnd.
            pos = spanEnd - 1;
        }
    }

    /**
     * Turns a list of symbol spans into tokens, in two passes.
     *
     * <p>Pass 1 looks at every pair of neighbouring spans with text between them and may move the
     * facing boundaries through {@code adjustFirstNonSymbolIndex} and
     * {@code adjustLastSymbolSequenceIndex}. The first and last span are adjusted whenever they are
     * non-empty; an inner span only when it is exactly one character long.
     *
     * <p>Pass 2 emits tokens from left to right: the first and last span go through
     * {@code addSymbols}, inner spans become one token each, and the text between two spans goes
     * through {@code addMorphemes}.
     *
     * @param list       receives the tokens
     * @param str        the text being tokenized
     * @param cArr       the characters of {@code str}
     * @param list2      the {@code {begin, end}} spans; their boundaries are modified in place
     * @param tokenIndex the running offset, advanced as tokens are added
     */
    private void tokenizeSymbolsAux(List<Token> list, String str, char[] cArr, List<int[]> list2, TokenIndex tokenIndex) {
        final int last = list2.size() - 1;

        // Pass 1: let the adjusters move span boundaries into the neighbouring text.
        for (int k = 0; k < last; k++) {
            int[] left = list2.get(k);
            int[] right = list2.get(k + 1);
            int gapBegin = left[1];
            int gapEnd = right[0];
            if (gapBegin >= gapEnd) {
                continue;
            }

            String between = str.substring(gapBegin, gapEnd);
            int leftLength = left[1] - left[0];
            int rightLength = right[1] - right[0];
            boolean adjustLeft = (k == 0) ? leftLength > 0 : leftLength == 1;
            boolean adjustRight = (k + 1 == last) ? rightLength > 0 : rightLength == 1;

            if (adjustLeft) {
                left[1] = adjustFirstNonSymbolIndex(cArr, gapBegin, between);
            }
            if (adjustRight) {
                right[0] = adjustLastSymbolSequenceIndex(cArr, gapEnd, between);
            }
        }

        // Pass 2: emit each span followed by the text up to the next span.
        for (int k = 0; k < last; k++) {
            int[] span = list2.get(k);
            int[] next = list2.get(k + 1);

            if (span[0] < span[1]) {
                String symbols = str.substring(span[0], span[1]);
                if (k == 0) {
                    tokenIndex.setVal(addSymbols(list, symbols, tokenIndex));
                } else {
                    int begin = tokenIndex.getVal();
                    int end = begin + symbols.length();
                    list.add(new Token(symbols, begin, end));
                    tokenIndex.setVal(end);
                }
            }

            if (span[1] < next[0]) {
                tokenIndex.setVal(addMorphemes(list, str.substring(span[1], next[0]), tokenIndex));
            }
        }

        int[] tail = list2.get(last);
        if (tail[0] < tail[1]) {
            tokenIndex.setVal(addSymbols(list, str.substring(tail[0], tail[1]), tokenIndex));
        }
    }

    /**
     * Decides whether the symbol(s) directly before a piece of text should be attached to it.
     *
     * <p>A positive result from the subclass hook {@code adjustFirstNonSymbolGap} wins and moves the
     * boundary back by that amount. Otherwise the boundary moves back by one when the preceding
     * character is a pre-digit symbol followed by a digit, is {@code '@'} or {@code '#'} followed by
     * an alphabet character, or is an apostrophe and {@code P_YEAR} matches the text.
     *
     * @param cArr the characters of the enclosing text
     * @param i    the index where the text currently begins; {@code cArr[i - 1]} is read
     * @param str  the text that follows the symbols
     * @return the possibly lowered begin index
     */
    private int adjustFirstNonSymbolIndex(char[] cArr, int i, String str) {
        char before = cArr[i - 1];
        char first = cArr[i];

        int gap = adjustFirstNonSymbolGap(cArr, i, str);
        if (gap > 0) {
            return i - gap;
        }

        boolean attach;
        if (CharUtils.isPreDigitSymbol(before)) {
            attach = CharUtils.isDigit(first);
        } else if (before == '@' || before == '#') {
            attach = CharUtils.isAlphabet(first);
        } else {
            attach = CharUtils.isApostrophe(before) && this.P_YEAR.matcher(str).find();
        }
        return attach ? i - 1 : i;
    }

    /**
     * Decides whether the symbol(s) directly after a piece of text should be attached to it.
     *
     * <p>A positive result from the subclass hook {@code adjustLastSymbolSequenceGap} wins and moves
     * the boundary forward by that amount. Otherwise the boundary moves forward by one when the
     * symbol is {@code '$'} and the currency dictionary accepts the lower-cased text, or when it is
     * {@code '.'} and {@code preservePeriod} accepts it.
     *
     * @param cArr the characters of the enclosing text
     * @param i    the index of the first symbol after the text
     * @param str  the text that precedes the symbols
     * @return the possibly raised symbol-begin index
     */
    protected int adjustLastSymbolSequenceIndex(char[] cArr, int i, String str) {
        String lower = StringUtils.toLowerCase(str);
        char symbol = cArr[i];

        int gap = adjustLastSymbolSequenceGap(cArr, i, str);
        if (gap > 0) {
            return i + gap;
        }

        boolean attach;
        if (symbol == '$') {
            attach = this.d_currency.isCurrencyDollar(lower);
        } else {
            attach = symbol == '.' && preservePeriod(cArr, i, str);
        }
        return attach ? i + 1 : i;
    }

    /**
     * Subclass hook consulted first by {@code adjustFirstNonSymbolIndex}. A positive return value is
     * subtracted from the text's begin index and bypasses the built-in rules; any other value lets
     * the built-in rules run.
     *
     * @param cArr the characters of the enclosing text
     * @param i    the index where the text currently begins
     * @param str  the text that follows the leading symbols
     * @return how many preceding characters to attach to the text, or a non-positive value for none
     */
    protected abstract int adjustFirstNonSymbolGap(char[] cArr, int i, String str);

    /**
     * Subclass hook consulted first by {@code adjustLastSymbolSequenceIndex}. A positive return
     * value is added to the symbol-begin index and bypasses the built-in rules; any other value lets
     * the built-in rules run.
     *
     * @param cArr the characters of the enclosing text
     * @param i    the index of the first symbol after the text
     * @param str  the text that precedes the trailing symbols
     * @return how many following characters to attach to the text, or a non-positive value for none
     */
    protected abstract int adjustLastSymbolSequenceGap(char[] cArr, int i, String str);

    /**
     * Splits a run of symbols into tokens and appends them.
     *
     * <p>A single character becomes one token. Otherwise the run is scanned left to right: a
     * character with a non-zero {@code getSymbolFlag}, or one that {@code getSpanIndex} groups with
     * following characters, becomes its own token; characters in between that do neither are
     * collected into one token.
     *
     * @param list       receives the tokens
     * @param str        the symbols to split
     * @param tokenIndex the running offset, advanced past every token added
     * @return the value of {@code tokenIndex} after all tokens were added
     */
    private int addSymbols(List<Token> list, String str, TokenIndex tokenIndex) {
        int length = str.length();
        if (length == 1) {
            int begin = tokenIndex.getVal();
            list.add(new Token(str, begin, begin + 1));
            tokenIndex.setVal(begin + 1);
            return tokenIndex.getVal();
        }

        char[] chars = str.toCharArray();
        int pending = 0; // start of the symbols not yet emitted
        int pos = 0;
        while (pos < length) {
            int flag = getSymbolFlag(chars[pos]);
            int runEnd = getSpanIndex(chars, pos, length, flag == 1);

            if (flag > 0 || runEnd > pos + 1) {
                if (pending < pos) {
                    // Flush the unflagged symbols collected before this run.
                    int begin = tokenIndex.getVal();
                    int end = begin + pos - pending;
                    list.add(new Token(str.substring(pending, pos), begin, end));
                    tokenIndex.setVal(end);
                }
                int begin = tokenIndex.getVal();
                int end = begin + runEnd - pos;
                list.add(new Token(str.substring(pos, runEnd), begin, end));
                tokenIndex.setVal(end);
                pending = runEnd;
            }
            pos = runEnd;
        }

        if (pending < length) {
            int begin = tokenIndex.getVal();
            int end = begin + length - pending;
            list.add(new Token(str.substring(pending), begin, end));
            tokenIndex.setVal(end);
        }
        return tokenIndex.getVal();
    }

    /**
     * Finds where the run that starts at {@code i} ends.
     *
     * @param cArr the characters to scan
     * @param i    index of the first character of the run
     * @param i2   index at which scanning stops (exclusive)
     * @param z    passed to {@code isConsecutive}: when true any final mark continues the run,
     *             otherwise only a repeat of {@code cArr[i]} does
     * @return the index just past the run; always at least {@code i + 1}
     */
    private int getSpanIndex(char[] cArr, int i, int i2, boolean z) {
        final char head = cArr[i];
        int end = i + 1;
        for (; end < i2; end++) {
            if (!isConsecutive(cArr, end, head, z)) {
                break;
            }
        }
        return end;
    }

    /**
     * Appends the tokens for a piece of text that lies between symbol spans.
     *
     * <p>Text longer than one character is offered, in order, to the currency dictionary, the unit
     * dictionary, {@code tokenizeDigit} and the subclass hook {@code tokenizeWordsMore}; the first
     * one that reports success is responsible for the tokens. If none does, or the text is a single
     * character, the text becomes one token.
     *
     * @param list       receives the tokens
     * @param str        the text to tokenize
     * @param tokenIndex the running offset, advanced past every token added
     * @return the value of {@code tokenIndex} after all tokens were added
     */
    private int addMorphemes(List<Token> list, String str, TokenIndex tokenIndex) {
        if (str.length() != 1) {
            char[] chars = str.toCharArray();
            // toLowerCase works on the array in place; a new string is built only if it reports true.
            String lower = CharUtils.toLowerCase(chars) ? new String(chars) : str;

            boolean handled = tokenize(list, str, lower, chars, this.d_currency, tokenIndex)
                    || tokenize(list, str, lower, chars, this.d_unit, tokenIndex)
                    || tokenizeDigit(list, str, chars, tokenIndex)
                    || tokenizeWordsMore(list, str, lower, chars, tokenIndex);
            if (handled) {
                return tokenIndex.getVal();
            }
        }

        int begin = tokenIndex.getVal();
        int end = begin + str.length();
        list.add(new Token(str, begin, end));
        tokenIndex.setVal(end);
        return tokenIndex.getVal();
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Asks a dictionary to split the text and, if it does, appends the pieces as tokens.
     *
     * @param list       receives the tokens
     * @param str        the original text
     * @param str2       the second text argument forwarded to the dictionary ({@code addMorphemes}
     *                   passes the lower-cased text)
     * @param cArr       the character array forwarded to the dictionary
     * @param dictionary the dictionary to consult
     * @param tokenIndex the running offset, advanced past the added tokens
     * @return {@code true} when the dictionary returned pieces, {@code false} when it returned {@code null}
     */
    public boolean tokenize(List<Token> list, String str, String str2, char[] cArr, Dictionary dictionary, TokenIndex tokenIndex) {
        String[] pieces = dictionary.tokenize(str, str2, cArr);
        if (pieces != null) {
            tokenIndex.setVal(addAll(list, pieces, tokenIndex.getVal()));
        }
        return pieces != null;
    }

    /**
     * Appends one token per string, laying them out back to back starting at offset {@code i}.
     *
     * @param list   receives the tokens
     * @param strArr the word forms, in order
     * @param i      the offset of the first token
     * @return the offset just past the last token added
     */
    public int addAll(List<Token> list, String[] strArr, int i) {
        int offset = i;
        for (int k = 0; k < strArr.length; k++) {
            String piece = strArr[k];
            int end = offset + piece.length();
            list.add(new Token(piece, offset, end));
            offset = end;
        }
        return offset;
    }

    /**
     * Splits one symbol off the front or the back of a digits-and-punctuation string.
     *
     * <p>The front is tried first: the first character must be accepted by {@code tokenizeDigitAux}
     * and the rest by {@code CharUtils.containsDigitPunctuationOnly}. If that fails, the same test
     * is applied with the last character as the symbol. On success two tokens are added.
     *
     * @param list       receives the tokens
     * @param str        the text to split
     * @param cArr       the characters that are tested
     * @param tokenIndex the running offset, advanced past both tokens on success
     * @return {@code true} when the text was split, {@code false} when nothing was added
     */
    private boolean tokenizeDigit(List<Token> list, String str, char[] cArr, TokenIndex tokenIndex) {
        int length = cArr.length;
        if (length < 2) {
            return false;
        }

        int split;
        if (tokenizeDigitAux(cArr[0]) && CharUtils.containsDigitPunctuationOnly(cArr, 1, length)) {
            split = 1;
        } else if (tokenizeDigitAux(cArr[length - 1]) && CharUtils.containsDigitPunctuationOnly(cArr, 0, length - 1)) {
            split = length - 1;
        } else {
            return false;
        }

        int begin = tokenIndex.getVal();
        int middle = begin + split;
        list.add(new Token(str.substring(0, split), begin, middle));
        tokenIndex.setVal(middle);

        int end = middle + str.length() - split;
        list.add(new Token(str.substring(split), middle, end));
        tokenIndex.setVal(end);
        return true;
    }

    /**
     * @param c the character to test
     * @return {@code true} for the symbols {@code tokenizeDigit} may split off:
     *         {@code '#'}, {@code '$'}, {@code '%'}, {@code '*'} and {@code '='}
     */
    private boolean tokenizeDigitAux(char c) {
        switch (c) {
            case '#':
            case '$':
            case '%':
            case '*':
            case '=':
                return true;
            default:
                return false;
        }
    }

    /**
     * Subclass hook tried last by {@code addMorphemes}, after the currency dictionary, the unit
     * dictionary and {@code tokenizeDigit} have all declined. When it returns {@code true},
     * {@code addMorphemes} adds no token of its own and returns the current {@code tokenIndex}
     * value, so the implementation presumably has to add the tokens and advance {@code tokenIndex}
     * itself; confirm against the subclasses.
     *
     * @param list       receives the tokens
     * @param str        the original text
     * @param str2       the lower-cased text as prepared by {@code addMorphemes}
     * @param cArr       the character array prepared by {@code addMorphemes}
     * @param tokenIndex the running offset
     * @return {@code true} when the implementation handled the text
     */
    protected abstract boolean tokenizeWordsMore(List<Token> list, String str, String str2, char[] cArr, TokenIndex tokenIndex);

    /**
     * Post-processes the tokens of one line. This is an overload, not {@code Object.finalize()}.
     *
     * <p>Every token is visited once, in order, and offered to {@code tokenizeNo},
     * {@code mergeBrackets}, {@code tokenizeYears} and {@code tokenizeMiddleSymbol}; the first rule
     * that reports a change ends the chain for that token. The value returned by
     * {@code tokenizeNo}, {@code tokenizeYears} or {@code tokenizeMiddleSymbol} is added to the
     * cursor so the visit continues after any tokens the rule inserted. The cursor advances by
     * exactly one position per visit, so a token on which no rule fires does not cause its
     * successor to be skipped.
     *
     * <p>Finally, if the line consists of a single token, {@code tokenizeLastPeriod} is applied.
     *
     * @param list the tokens of the line; modified in place
     * @param str  the line text, forwarded to {@code mergeBrackets}
     */
    private void finalize(List<Token> list, String str) {
        int size = list.size();
        for (int index = 0; index < size; index++) {
            String wordForm = list.get(index).getWordForm();
            String lowerCase = StringUtils.toLowerCase(wordForm);

            int delta = tokenizeNo(list, wordForm, lowerCase, index);
            // NOTE(review): mergeBrackets returns -1 after a merge, but only "changed or not" is used
            // here and the cursor is left as is, so the token that slides into this slot is not visited.
            // Kept unchanged on purpose; confirm whether the -1 was meant to be applied.
            if (delta == 0 && mergeBrackets(list, wordForm, index, str) == 0) {
                delta = tokenizeYears(list, wordForm, index);
                if (delta == 0) {
                    delta = tokenizeMiddleSymbol(list, wordForm, lowerCase, index);
                }
            }

            // Rules may have inserted or removed tokens.
            size = list.size();
            index += delta;
        }

        if (list.size() == 1) {
            tokenizeLastPeriod(list);
        }
    }

    /**
     * Splits the token {@code "no."} (compared in lower case) into the word and a separate period,
     * unless the next token starts with a digit.
     *
     * @param list the tokens; modified in place when the split happens
     * @param str  the token's word form (not read)
     * @param str2 the lower-cased word form
     * @param i    the token's index in {@code list}
     * @return {@code 1} when a period token was inserted after the token, otherwise {@code 0}
     */
    private int tokenizeNo(List<Token> list, String str, String str2, int i) {
        if (!str2.equals("no.")) {
            return 0;
        }

        boolean hasNext = i + 1 != list.size();
        if (hasNext && CharUtils.isDigit(list.get(i + 1).getWordForm().charAt(0))) {
            return 0;
        }

        Token original = list.get(i);
        int periodBegin = original.getEndOffset() - 1;
        list.set(i, new Token(StringUtils.trim(original.getWordForm(), 1), original.getStartOffset(), periodBegin));
        list.add(i + 1, new Token(StringConst.PERIOD, periodBegin, original.getEndOffset()));
        return 1;
    }

    /**
     * Merges a token with its two neighbours when it sits between brackets.
     *
     * <p>The token must be one character long or consist of digits only, and it must have a
     * neighbour on both sides. The merge happens when the previous token starts with a left bracket
     * and the next token starts with a right bracket: the three word forms are concatenated into
     * one token stored at {@code i - 1}, and the other two entries are removed.
     *
     * @param list the tokens; modified in place when the merge happens
     * @param str  the word form of the token at {@code i}
     * @param i    the token's index in {@code list}
     * @param str2 the line text (not read)
     * @return {@code -1} when the tokens were merged, otherwise {@code 0}
     */
    private int mergeBrackets(List<Token> list, String str, int i, String str2) {
        boolean candidate = str.length() == 1 || StringUtils.containsDigitOnly(str);
        boolean hasNeighbours = i - 1 >= 0 && i + 1 < list.size();
        if (!candidate || !hasNeighbours) {
            return 0;
        }

        Token left = list.get(i - 1);
        Token right = list.get(i + 1);
        boolean bracketed = CharUtils.isLeftBracket(left.getWordForm().charAt(0))
                && CharUtils.isRightgBracket(right.getWordForm().charAt(0));
        if (!bracketed) {
            return 0;
        }

        String merged = left.getWordForm() + list.get(i).getWordForm() + right.getWordForm();
        list.set(i - 1, new Token(merged, left.getStartOffset(), right.getEndOffset()));
        // Remove the middle token, then the right bracket that slid into its slot.
        list.remove(i);
        list.remove(i);
        return -1;
    }

    /**
     * Splits a token matched by {@code P_YEAR_YEAR} into three tokens taken from groups 2, 3 and 4
     * of the match.
     *
     * @param list the tokens; modified in place when the pattern matches
     * @param str  the token's word form
     * @param i    the token's index in {@code list}
     * @return the value returned by {@code addTokens} when the pattern matches, otherwise {@code 0}
     */
    private int tokenizeYears(List<Token> list, String str, int i) {
        Matcher years = this.P_YEAR_YEAR.matcher(str);
        return years.find() ? addTokens(years, list, i, 2, 3, 4) : 0;
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Replaces the token at {@code i} by one token per requested regex group.
     *
     * <p>The existing token takes the text of the first group. For each further group a new token
     * is inserted directly after the previous one, starting at the previous token's end offset.
     * End offsets are set through {@code Token.resetEndOffset()}, which presumably derives them
     * from the start offset and word form; confirm against {@code Token}.
     *
     * @param matcher a matcher that has already matched
     * @param list    the tokens; modified in place
     * @param i       index of the token to rewrite
     * @param iArr    the group numbers to read, in order; must contain at least one
     * @return the number of tokens inserted, i.e. {@code iArr.length - 1}
     */
    public int addTokens(Matcher matcher, List<Token> list, int i, int... iArr) {
        Token previous = list.get(i);
        previous.setWordForm(matcher.group(iArr[0]));
        previous.resetEndOffset();

        for (int k = 1; k < iArr.length; k++) {
            Token next = new Token(matcher.group(iArr[k]));
            next.setStartOffset(previous.getEndOffset());
            next.resetEndOffset();
            list.add(i + k, next);
            previous = next;
        }
        return iArr.length - 1;
    }

    /**
     * Subclass hook tried last by the {@code finalize} pass for each token, after
     * {@code tokenizeNo}, {@code mergeBrackets} and {@code tokenizeYears} have left it unchanged.
     * The return value is added to the pass's token cursor; {@code 0} means nothing changed.
     *
     * @param list the tokens of the line
     * @param str  the word form of the token at {@code i}
     * @param str2 the lower-cased word form
     * @param i    the token's index in {@code list}
     * @return the amount by which the cursor should move in addition to its normal step
     */
    protected abstract int tokenizeMiddleSymbol(List<Token> list, String str, String str2, int i);

    /**
     * Splits a trailing period off the last token. The split happens when the word form is longer
     * than one character, ends with {@code '.'}, and the character before the period is not a
     * final mark.
     *
     * @param list the tokens; must not be empty; modified in place when the split happens
     */
    private void tokenizeLastPeriod(List<Token> list) {
        int lastIndex = list.size() - 1;
        Token last = list.get(lastIndex);
        String form = last.getWordForm();
        char[] chars = form.toCharArray();
        int length = form.length();

        boolean splittable = length > 1
                && chars[length - 1] == '.'
                && !CharUtils.isFinalMark(chars[length - 2]);
        if (splittable) {
            int periodBegin = last.getEndOffset() - 1;
            list.set(lastIndex, new Token(StringUtils.trim(form, 1), last.getStartOffset(), periodBegin));
            list.add(lastIndex + 1, new Token(StringConst.PERIOD, periodBegin, last.getEndOffset()));
        }
    }

    /**
     * Subclass hook checked first by {@code addNextSymbolSequenceIndices} for every position it
     * scans. Returning {@code true} keeps that position from starting a symbol span.
     *
     * @param cArr the characters of the text
     * @param i    the position being examined
     * @return {@code true} to keep the character at {@code i} attached to its surroundings
     */
    protected abstract boolean preserveSymbolInBetween(char[] cArr, int i);

    /**
     * Tells whether the symbol at {@code i} is part of a number-like expression and must not split
     * the text.
     *
     * <ul>
     *   <li>hyphen: preceded by an alphanumeric character and followed by a digit;</li>
     *   <li>{@code '/'}: preceded and followed by a digit;</li>
     *   <li>{@code ','}: preceded by a digit and followed by exactly three digits, i.e. the
     *       character after those three is missing or not a digit.</li>
     * </ul>
     *
     * @param cArr the characters of the text
     * @param i    the position being examined
     * @return {@code true} when one of the rules above applies
     */
    private boolean preserveSymbolInDigits(char[] cArr, int i) {
        char symbol = cArr[i];

        if (CharUtils.isHyphen(symbol)) {
            return 0 <= i - 1 && i + 1 < cArr.length
                    && CharUtils.isAlnum(cArr[i - 1]) && CharUtils.isDigit(cArr[i + 1]);
        }
        if (symbol == '/') {
            return 0 <= i - 1 && i + 1 < cArr.length
                    && CharUtils.isDigit(cArr[i - 1]) && CharUtils.isDigit(cArr[i + 1]);
        }
        if (symbol != ',') {
            return false;
        }

        // Comma: needs one character before and three after.
        if (i - 1 < 0 || i + 3 >= cArr.length) {
            return false;
        }
        // A fourth digit after the comma rules it out.
        if (i + 4 != cArr.length && CharUtils.isDigit(cArr[i + 4])) {
            return false;
        }
        return CharUtils.isDigit(cArr[i - 1])
                && CharUtils.isDigit(cArr[i + 1])
                && CharUtils.isDigit(cArr[i + 2])
                && CharUtils.isDigit(cArr[i + 3]);
    }

    /**
     * @param cArr the characters of the text
     * @param i    the position being examined
     * @return {@code true} when {@code cArr[i]} is {@code '&'} with an alphabet character directly
     *         before and directly after it
     */
    private boolean preserveSymbolInAlphabets(char[] cArr, int i) {
        if (cArr[i] != '&') {
            return false;
        }
        boolean hasNeighbours = i - 1 >= 0 && i + 1 < cArr.length;
        return hasNeighbours && CharUtils.isAlphabet(cArr[i - 1]) && CharUtils.isAlphabet(cArr[i + 1]);
    }

    /**
     * Decides whether the period at {@code i} stays attached to the text before it.
     *
     * <p>The character after the period is checked first: a separator mark keeps the period, a
     * final mark or quotation mark releases it. Otherwise the period is kept when
     * {@code P_ABBREVIATION} matches the text, or when the text is 2 to 5 characters long and
     * consists only of consonants.
     *
     * @param cArr the characters of the enclosing text
     * @param i    the index of the period
     * @param str  the text that precedes the period
     * @return {@code true} when the period should stay with {@code str}
     */
    protected boolean preservePeriod(char[] cArr, int i, String str) {
        int nextIndex = i + 1;
        if (nextIndex < cArr.length) {
            char next = cArr[nextIndex];
            if (CharUtils.isSeparatorMark(next)) {
                return true;
            }
            boolean closesSentence = CharUtils.isFinalMark(next) || CharUtils.isQuotationMark(next);
            if (closesSentence) {
                return false;
            }
        }

        boolean abbreviation = this.P_ABBREVIATION.matcher(str).find();
        if (abbreviation) {
            return true;
        }

        int length = str.length();
        boolean shortWord = 2 <= length && length <= 5;
        return shortWord && CharUtils.containsOnlyConsonants(str);
    }

    /**
     * @param c the character to test
     * @return {@code true} when {@code CharUtils} classifies {@code c} as punctuation, general
     *         punctuation, a currency sign or an arrow
     */
    private boolean isSymbol(char c) {
        if (CharUtils.isPunctuation(c)) {
            return true;
        }
        if (CharUtils.isGeneralPunctuation(c)) {
            return true;
        }
        if (CharUtils.isCurrency(c)) {
            return true;
        }
        return CharUtils.isArrow(c);
    }

    /**
     * @param cArr the characters of the text
     * @param i    the position being examined
     * @return {@code true} when {@code cArr[i]} is a final mark and the next character exists and
     *         is a final mark, a separator mark or a quotation mark
     */
    private boolean isEllipsis(char[] cArr, int i) {
        boolean finalMarkWithNext = CharUtils.isFinalMark(cArr[i]) && i + 1 < cArr.length;
        if (!finalMarkWithNext) {
            return false;
        }
        char next = cArr[i + 1];
        if (CharUtils.isFinalMark(next)) {
            return true;
        }
        return CharUtils.isSeparatorMark(next) || CharUtils.isQuotationMark(next);
    }

    /**
     * @param c the character to test
     * @return {@code true} when {@code c} is a bracket, an arrow, a double quotation mark, a hyphen
     *         or a member of {@code S_SYMBOL_IN_BETWEEN}
     */
    private boolean isSymbolInBetween(char c) {
        if (CharUtils.isBracket(c) || CharUtils.isArrow(c)) {
            return true;
        }
        if (CharUtils.isDoubleQuotationMark(c) || CharUtils.isHyphen(c)) {
            return true;
        }
        return this.S_SYMBOL_IN_BETWEEN.contains(c);
    }

    /**
     * Tells whether the character at {@code i} continues a run.
     *
     * @param cArr the characters of the text
     * @param i    the position being examined
     * @param c    the character that started the run
     * @param z    when true any final mark continues the run; otherwise only a repeat of {@code c}
     * @return {@code true} when {@code cArr[i]} continues the run
     */
    private boolean isConsecutive(char[] cArr, int i, char c, boolean z) {
        if (z) {
            return CharUtils.isFinalMark(cArr[i]);
        }
        return c == cArr[i];
    }

    /**
     * Classifies a symbol for {@code addSymbols}.
     *
     * @param c the character to classify
     * @return {@code 1} for a final mark; {@code 2} for a bracket, separator mark, quotation mark
     *         or back-tick; {@code 0} for anything else
     */
    private int getSymbolFlag(char c) {
        if (CharUtils.isFinalMark(c)) {
            return 1;
        }
        boolean standalone = CharUtils.isBracket(c)
                || CharUtils.isSeparatorMark(c)
                || CharUtils.isQuotationMark(c)
                || c == '`';
        if (standalone) {
            return 2;
        }
        return 0;
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * @param str the text to test
     * @return {@code true} when every character of {@code str} is a final mark; also {@code true}
     *         for an empty string
     */
    public boolean isFinalMarksOnly(String str) {
        char[] chars = str.toCharArray();
        int pos = 0;
        while (pos < chars.length && CharUtils.isFinalMark(chars[pos])) {
            pos++;
        }
        return pos == chars.length;
    }
}
