package edu.stanford.nlp.ie.machinereading.domains.ace.reader;

import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.ling.tokensregex.types.Expressions;
import edu.stanford.nlp.process.AbstractTokenizer;
import edu.stanford.nlp.trees.international.arabic.ATBTreeUtils;
import edu.stanford.nlp.util.Generics;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: input_file:edu/stanford/nlp/ie/machinereading/domains/ace/reader/RobustTokenizer.class */
public class RobustTokenizer<T extends Word> extends AbstractTokenizer<Word> {
    /** The raw text to tokenize; assigned once by the constructors. */
    String buffer;
    /** Abbreviation lexicon consulted when adjacent tokens are glued back together. */
    private AbbreviationMap mAbbreviations;
    /**
     * Maximum number of consecutive tokens considered as one multi-word abbreviation
     * candidate; matches the 20-token window used by tokenizeToWordTokens().
     */
    public static final int MAX_MULTI_WORD_SIZE = 20;
    /** Tokens backing getNext(); stays null until getNext() is first called. */
    Word[] cachedTokens;
    /** Index into cachedTokens of the token the next getNext() call returns. */
    int cachedPosition;
    // ---------------------------------------------------------------------
    // Regex fragments. Every constant is a regular-expression source string
    // wrapped in a group by block(); larger fragments are assembled from the
    // smaller ones, so declaration order matters (static initialisation).
    // ---------------------------------------------------------------------

    // Single punctuation characters.
    public static final String DOT = block("\\.");
    // Despite the name, this matches a colon.
    public static final String DOTDOT = block("\\:");
    public static final String APOSTROPHE = block("\\'");
    public static final String SLASH = block("\\/");
    public static final String UNDERSCORE = block("\\_");
    public static final String MINUS = block("\\-");
    public static final String PLUS = block("\\+");
    public static final String COMMA = block("\\,");
    // Despite the name, this matches a semicolon.
    public static final String DOTCOMMA = block("\\;");
    // Any of: ", '', ', ``, `
    public static final String QUOTES = block(or("\\\"", "\\'\\'", "\\'", "\\`\\`", "\\`"));
    // Either " or ''
    public static final String DOUBLE_QUOTES = block(or("\\\"", "\\'\\'"));
    // Round and curly brackets.
    public static final String LRB = block("\\(");
    public static final String RRB = block("\\)");
    public static final String LCB = block("\\{");
    public static final String RCB = block("\\}");
    public static final String GREATER = block("\\>");
    public static final String LOWER = block("\\<");
    public static final String AMPERSAND = block("\\&");
    public static final String AT = block("\\@");
    // The literal prefix "http://", each letter in either case.
    public static final String HTTP = block("[hH][tT][tT][pP]\\:\\/\\/");

    // Character classes.
    public static final String WHITE_SPACE = block("\\s");
    public static final String DIGIT = block("\\d");
    // ASCII letters only.
    public static final String LETTER = block("[a-zA-Z]");
    public static final String UPPER = block("[A-Z]");

    // Numbers.
    public static final String SIGN = or(MINUS, PLUS);
    // Optional sign, digits, then any number of (optional '.', ',' or '/' followed by digits).
    public static final String FULLNUM = block(zeroOrOne(SIGN) + oneOrMore(DIGIT) + zeroOrMore(zeroOrOne(or(DOT, COMMA, SLASH)) + oneOrMore(DIGIT)));
    // A leading dot followed by digits, e.g. ".5".
    public static final String DECNUM = block(DOT + oneOrMore(DIGIT));
    public static final String NUM = or(FULLNUM, DECNUM);
    // digits/digits/digits
    public static final String DATE = block(oneOrMore(DIGIT) + SLASH + oneOrMore(DIGIT) + SLASH + oneOrMore(DIGIT));
    // digits followed by one or more ":digits" groups
    public static final String TIME = block(oneOrMore(DIGIT) + oneOrMore(block(DOTDOT + oneOrMore(DIGIT))));
    // A quote mark, or a run of two or more '-', or a run of two or more '.'.
    public static final String PUNC = or(QUOTES, block(MINUS + oneOrMore(MINUS)), block(DOT + oneOrMore(DOT)));

    // Words.
    public static final String LETTERS = oneOrMore(LETTER);
    // A number or a run of letters.
    public static final String BLOCK = or(NUM, LETTERS);
    // Optional leading apostrophe, a BLOCK, then any number of (optional connector
    // among _ - ' / & followed by another BLOCK).
    public static final String WORD = block(zeroOrOne(APOSTROPHE) + BLOCK + zeroOrMore(block(zeroOrOne(or(UNDERSCORE, MINUS, APOSTROPHE, SLASH, AMPERSAND)) + BLOCK)));
    // One or more (single letter + dot), e.g. "U.S.".
    public static final String ACRONYM = block(oneOrMore(LETTER + DOT));
    // One or more (letters + dot), optionally followed by trailing letters.
    public static final String LOOSE_ACRONYM = block(oneOrMore(oneOrMore(LETTER) + DOT) + zeroOrMore(LETTER));
    public static final String PAREN = or(LRB, RRB, LCB, RCB);
    // '&', exactly one upper-case letter, ';'.
    public static final String HTMLCODE = block(AMPERSAND + UPPER + DOTCOMMA);
    // Any single non-whitespace character.
    public static final String ANY = block("\\S");

    // E-mail addresses and URLs.
    // Starts with a letter, '@' in the middle, ends with a letter.
    public static final String EMAIL = block(LETTER + zeroOrMore(or(LETTER, DIGIT, DOT, MINUS, UNDERSCORE)) + AT + zeroOrMore(or(LETTER, DIGIT, DOT, MINUS, UNDERSCORE)) + LETTER);
    // Like EMAIL, but allows whitespace around the last dot and requires one of a fixed list of suffixes.
    public static final String DOMAIN_EMAIL = block(LETTER + zeroOrMore(or(LETTER, DIGIT, DOT, MINUS, UNDERSCORE)) + AT + oneOrMore(or(LETTER, DIGIT, DOT, MINUS, UNDERSCORE)) + zeroOrMore(WHITE_SPACE) + DOT + zeroOrMore(WHITE_SPACE) + or("org", "ORG", "com", "COM", "net", "NET", "ru", "us"));
    // HTTP prefix followed by one or more letters, digits or . _ / & - + characters.
    public static final String URL = block(HTTP + oneOrMore(or(LETTER, DIGIT, DOT, UNDERSCORE, SLASH, AMPERSAND, MINUS, PLUS)));
    // One or more (letters + dot), optional whitespace, then one of the fixed suffixes.
    // NOTE: not one of the alternatives of RECOGNISED_PATTERN in this file.
    public static final String SMALL_URL = block(oneOrMore(oneOrMore(LETTER) + DOT) + zeroOrMore(WHITE_SPACE) + or("org", "ORG", "com", "COM", "net", "NET", "ru", "us"));
    /**
     * A run of one or more underscore characters.
     * The underscore is spelled out as a literal rather than borrowed from an unrelated
     * constant, so that this regular expression cannot change if that constant does.
     */
    public static final String UNDERSCORESEQ = oneOrMore("_");
    // One or two letters in round brackets, e.g. "(a)" or "(ii)".
    public static final String LIST_BULLET = block(LRB + LETTER + zeroOrOne(LETTER) + RRB);
    // Digits in round brackets, e.g. "(650)".
    public static final String PHONE_PART = block(LRB + oneOrMore(DIGIT) + RRB);
    public static final String DIGITSEQ = oneOrMore(DIGIT);
    // An angle-bracketed tag: '<', one or more characters other than '<' and '>', then '>'.
    public static final String SGML = "<[^<>]+>";
    // The master token pattern: an alternation of the fragments above. The regex engine
    // tries alternatives in the order listed, so earlier entries take precedence;
    // ANY is last and acts as the catch-all for any single non-whitespace character.
    public static final String RECOGNISED_PATTERN = block(block(TIME) + "|" + block(DOMAIN_EMAIL) + "|" + block(EMAIL) + "|" + block(URL) + "|" + block(ACRONYM) + "|" + block(DATE) + "|" + block(PHONE_PART) + "|" + block(WORD) + "|" + block(PUNC) + "|" + block(LIST_BULLET) + "|" + block(PAREN) + "|" + block(SGML) + "|" + block(HTMLCODE) + "|" + block(UNDERSCORESEQ) + "|" + block(ANY));
    // Pre-compiled patterns. wordPattern drives tokenizeToWordTokens(); the others back the is*() predicates.
    private static final Pattern wordPattern = Pattern.compile(RECOGNISED_PATTERN);
    private static final Pattern sgmlPattern = Pattern.compile(SGML);
    private static final Pattern slashDatePattern = Pattern.compile(DATE);
    // Note: compiled from LOOSE_ACRONYM, not ACRONYM.
    private static final Pattern acronymPattern = Pattern.compile(LOOSE_ACRONYM);
    private static final Pattern urlPattern = Pattern.compile(URL);
    private static final Pattern emailPattern = Pattern.compile(EMAIL);
    private static final Pattern digitSeqPattern = Pattern.compile(DIGITSEQ);

    /* loaded from: input_file:edu/stanford/nlp/ie/machinereading/domains/ace/reader/RobustTokenizer$AbbreviationMap.class */
    /**
     * A fixed lexicon of abbreviations (and a few other dot-containing tokens) that the
     * tokenizer keeps together as a single token.
     *
     * <p>The lexicon can be built case-insensitively (entries and queries are both
     * lower-cased) or case-sensitively (entries and queries are compared exactly).
     */
    public static class AbbreviationMap {
        /** The lexicon; lower-cased when {@link #mCaseInsensitive} is true. */
        private final Set<String> mAbbrevSet;
        /** Whether entries were lower-cased at construction, so queries must be too. */
        private final boolean mCaseInsensitive;

        /**
         * Lower-cases every entry when case-insensitive matching is requested.
         *
         * @param z    true to lower-case the entries, false to return them untouched
         * @param list the entries to normalize
         * @return {@code list} itself when {@code z} is false, otherwise a new list of lower-cased entries
         */
        private static List<String> normalizeCase(boolean z, List<String> list) {
            if (!z) {
                return list;
            }
            List<String> lowered = new ArrayList<String>(list.size());
            for (String entry : list) {
                // Locale.ROOT keeps the result independent of the JVM's default locale
                // (e.g. Turkish dotless i), so stored entries and queries always agree.
                lowered.add(entry.toLowerCase(Locale.ROOT));
            }
            return lowered;
        }

        /**
         * Builds the lexicon.
         *
         * @param z true for case-insensitive lookups, false for exact-case lookups
         */
        public AbbreviationMap(boolean z) {
            this.mCaseInsensitive = z;
            this.mAbbrevSet = Generics.newHashSet(normalizeCase(z, Arrays.asList(
                "1.", "10.", "11.", "12.", "13.", "14.", "15.", "16.", "17.", "18.", "19.", "2.", "20.", "21.", "22.",
                "23.", "24.", "25.", "26.", "27.", "28.", "29.", "3.", "30.", "31.", "32.", "33.", "34.", "35.", "36.",
                "37.", "38.", "39.", "4.", "40.", "41.", "42.", "43.", "44.", "45.", "46.", "47.", "48.", "49.", "5.",
                "50.", "6.", "7.", "8.", "9.", "A.", "A.C.", "A.D.", "A.D.L.", "A.F.", "A.G.", "A.H.", "A.J.C.", "A.L.",
                "A.M", "A.M.", "A.P.", "A.T.B.", "AUG.", "Act.", "Adm.", "Ala.", "Ariz.", "Ark.", "Assn.", "Ass'n.",
                "Ass'n", "Aug.", "B.", "B.A.T", "B.B.", "B.F.", "B.J.", "B.V.", "Bancorp.", "Bhd.", "Blvd.", "Br.",
                "Brig.", "Bros.", "C.", "C.B.", "C.D.s", "C.J.", "C.O.", "C.R.", "C.W.", "CEO.", "CO.", "CORP.", "COS.",
                "Cal.", "Calif.", "Capt.", "Cie.", "Cir.", "Cmdr.", "Co.", "Col.", "Colo.", "Comdr.", "Conn.", "Corp.",
                "Cos.", "D.", "D.B.", "D.C", "D.C.", "D.H.", "D.M.", "D.N.", "D.S.", "D.T", "D.T.", "D.s", "Dec.",
                "Del.", "Dept.", "Dev.", "Dr.", "Ds.", "E.", "E.E.", "E.F.", "E.I.", "E.M.", "E.R.", "E.W.", "Etc.",
                "F.", "F.A.", "F.A.O.", "F.C", "F.E.", "F.J.", "F.S.B.", "F.W.", "FEB.", "FL.", "Feb.", "Fed.", "Fla.",
                "Fran.", "French.", "Freon.", "Ft.", "G.", "G.D.", "G.L.", "G.O.", "G.S.", "G.m.b", "G.m.b.H.", "GP.",
                "GPO.", "Ga.", "Gen.", "Gov.", "H.", "H.F.", "H.G.", "H.H.", "H.J.", "H.L.", "H.R.", "Hon.", "I.",
                "I.B.M.", "I.C.H.", "I.E.P.", "I.M.", "I.V.", "I.W.", "II.", "III.", "INC.", "Intl.", "Int'l", "IV.",
                "IX.", "Ill.", "Inc.", "Ind.", "J.", "J.C.", "J.D.", "J.E.", "J.F.", "J.F.K.", "J.H.", "J.L.", "J.M.",
                "JohnQ.Public", "J.P.", "J.R.", "J.V", "J.V.", "J.X.", "Jan.", "Jansz.", "Je.", "Jos.", "Jr.", "K.",
                "K.C.", "Kan.", "Ky.", "L.", "L.A.", "L.H.", "L.J.", "L.L.", "L.M.", "L.P", "L.P.", "La.", "Lt.", "Ltd.",
                "M.", "M.A.", "M.B.A.", "M.D", "M.D.", "M.D.C.", "M.E.", "M.J.", "M.R.", "M.S.", "M.W.", "M8.7sp",
                "Maj.", "Mar.", "Mass.", "Md.", "Med.", "Messrs.", "Mfg.", "Mich.", "Minn.",
                "Mir.", "Miss.", "Mo.", "Mr.", "Mrs.", "Ms.", "Mt.", "N.", "N.A.", "N.C", "N.C.", "N.D", "N.D.", "N.H",
                "N.H.", "N.J", "N.J.", "N.M", "N.M.", "N.V", "N.V.", "N.Y", "N.Y.", "NOV.", "Neb.", "Nev.", "No.", "no.",
                "Nos.", "Nov.", "O.", "O.P.", "OK.", "Oct.", "Okla.", "Ore.", "P.", "P.J.", "P.M", "P.M.", "P.R.", "Pa.",
                "Penn.", "Pfc.", "Ph.", "Ph.D.", "pro-U.N.", "Prof.", "Prop.", "Pty.", "Q.", "R.", "R.D.", "Ret.",
                "R.H.", "R.I", "R.I.", "R.L.", "R.P.", "R.R.", "R.W.", "RLV.", "Rd.", "Rep.", "Reps.", "Rev.", "S.",
                "S.A", "S.A.", "S.C", "S.C.", "S.D.", "S.G.", "S.I.", "S.P.", "S.S.", "S.p", "S.p.A", "S.p.A.", "SKr1.5",
                "Sen.", "Sens.", "Sept.", "Sgt.", "Snr.", "Spc.", "Sr.", "St.", "Sys.", "T.", "T.D.", "T.F.", "T.T.",
                "T.V.", "TEL.", "Tech.", "Tenn.", "Tex.", "Tx.", "U.", "U.Cal-Davis", "U.K", "U.K.", "U.N.", "U.S.",
                "U.S.A", "U.S.A.", "U.S.C.", "U.S.C..", "U.S.S.R", "U.S.S.R.", "UK.", "US116.7", "V.", "V.H.", "VI.",
                "VII.", "VIII.", "VS.", "Va.", "Vs.", "Vt.", "W.", "W.A.", "W.G.", "W.I.", "W.J.", "W.R.", "W.T.",
                "W.Va", "W.Va.", "Wash.", "Wis.", "Wyo.", "X.", "Y.", "Y.J.", "Z.", "a.", "a.d.", "a.k.a", "a.m", "a.m.",
                "al.", "b.", "c.", "c.i.f", "cf.", "cnsl.", "cnsls.", "cont'd.", "d.", "deft.", "defts.", "e.", "et.",
                "etc.", "etseq.", "f.", "f.o.b", "ft.", "g.", "h.", "i.", "i.e.", "j.", "k.", "l.", "m.", "mots.", "n.",
                "o.", "p.", "p.m", "p.m.", "pltf.", "pltfs.", "prelim.", "r.", "s.", "seq.", "supp.", "sq.", "t.", "u.",
                "v.", "vs.", "x.", "y.", "z.")));
        }

        /**
         * Tests whether a string is a known abbreviation.
         *
         * <p>The query is lower-cased only when the lexicon itself was lower-cased.
         * Lower-casing it unconditionally would make every mixed-case entry (such as
         * "Mr.") unreachable in a case-sensitive lexicon.
         *
         * @param str the candidate token text
         * @return true if {@code str} is in the lexicon under the configured case rule
         */
        public boolean contains(String str) {
            String key = this.mCaseInsensitive ? str.toLowerCase(Locale.ROOT) : str;
            return this.mAbbrevSet.contains(key);
        }
    }

    /* loaded from: input_file:edu/stanford/nlp/ie/machinereading/domains/ace/reader/RobustTokenizer$WordToken.class */
    /**
     * A token together with its character span in the source text and the number of
     * newlines recorded for it.
     */
    public static class WordToken {
        /** Start offset of the token. */
        protected int mStart;
        /** End offset of the token. */
        protected int mEnd;
        /** Newline count associated with the token. */
        protected int mNewLineCount;
        /** The token text. */
        protected String mWord;

        /**
         * Creates a token with a newline count of zero.
         *
         * @param str token text
         * @param i   start offset
         * @param i2  end offset
         */
        public WordToken(String str, int i, int i2) {
            this(str, i, i2, 0);
        }

        /**
         * Creates a token.
         *
         * @param str token text
         * @param i   start offset
         * @param i2  end offset
         * @param i3  newline count
         */
        public WordToken(String str, int i, int i2, int i3) {
            mWord = str;
            mStart = i;
            mEnd = i2;
            mNewLineCount = i3;
        }

        /** Renders the token as {@code [word, start, end]}; the newline count is not included. */
        @Override
        public String toString() {
            return "[" + mWord + ", " + mStart + ", " + mEnd + "]";
        }

        /** @return the start offset */
        public int getStart() {
            return mStart;
        }

        /** @param i the new start offset */
        public void setStart(int i) {
            mStart = i;
        }

        /** @return the end offset */
        public int getEnd() {
            return mEnd;
        }

        /** @param i the new end offset */
        public void setEnd(int i) {
            mEnd = i;
        }

        /** @return the newline count */
        public int getNewLineCount() {
            return mNewLineCount;
        }

        /** @param i the new newline count */
        public void setNewLineCount(int i) {
            mNewLineCount = i;
        }

        /** @return the token text */
        public String getWord() {
            return mWord;
        }

        /** @param str the new token text */
        public void setWord(String str) {
            mWord = str;
        }
    }

    /**
     * Creates a tokenizer over {@code str} that matches abbreviations case-insensitively.
     *
     * @param str the text to tokenize
     */
    public RobustTokenizer(String str) {
        this(true, str);
    }

    /**
     * Creates a tokenizer over {@code str}.
     *
     * @param z   passed to the AbbreviationMap constructor; true requests a lower-cased
     *            (case-insensitive) abbreviation lexicon
     * @param str the text to tokenize
     */
    public RobustTokenizer(boolean z, String str) {
        this.mAbbreviations = new AbbreviationMap(z);
        this.buffer = str;
        // No tokens cached yet; getNext() tokenizes on first use.
        this.cachedTokens = null;
    }

    /**
     * Builds a grouped character class from the given class contents.
     *
     * @param str the contents to place between the square brackets
     * @return {@code ([str])}
     */
    public static String range(String str) {
        String charClass = "[" + str + "]";
        return block(charClass);
    }

    /**
     * Makes a regex fragment optional.
     *
     * @param str the fragment
     * @return {@code ((str)?)}
     */
    public static String zeroOrOne(String str) {
        String optional = block(str) + "?";
        return block(optional);
    }

    /**
     * Allows a regex fragment to repeat any number of times, including none.
     *
     * @param str the fragment
     * @return {@code ((str)*)}
     */
    public static String zeroOrMore(String str) {
        String repeated = block(str) + "*";
        return block(repeated);
    }

    /**
     * Requires a regex fragment to occur at least once.
     *
     * <p>The {@code +} quantifier is written as a literal, like the {@code ?} and
     * {@code *} of the sibling helpers, instead of being read from an unrelated
     * treebank constant whose value this regex should not depend on.
     *
     * @param str the fragment
     * @return {@code ((str)+)}
     */
    public static String oneOrMore(String str) {
        return block(block(str) + "+");
    }

    /**
     * Wraps a regex fragment in a group.
     *
     * @param str the fragment
     * @return {@code (str)}
     */
    public static String block(String str) {
        StringBuilder grouped = new StringBuilder();
        grouped.append('(').append(str).append(')');
        return grouped.toString();
    }

    /**
     * Shared implementation of the {@code or} overloads: wraps every alternative in a
     * group, joins them with {@code |} and wraps the result in a group.
     *
     * @param alternatives the regex fragments to choose between
     * @return {@code ((a)|(b)|...)}
     */
    private static String alternation(String... alternatives) {
        StringBuilder joined = new StringBuilder();
        for (int k = 0; k < alternatives.length; k++) {
            if (k > 0) {
                joined.append("|");
            }
            joined.append(block(alternatives[k]));
        }
        return block(joined.toString());
    }

    /**
     * Builds a grouped alternation of two regex fragments.
     *
     * @return {@code ((str)|(str2))}
     */
    public static String or(String str, String str2) {
        return alternation(str, str2);
    }

    /** Builds a grouped alternation of three regex fragments. */
    public static String or(String str, String str2, String str3) {
        return alternation(str, str2, str3);
    }

    /** Builds a grouped alternation of four regex fragments. */
    public static String or(String str, String str2, String str3, String str4) {
        return alternation(str, str2, str3, str4);
    }

    /** Builds a grouped alternation of five regex fragments. */
    public static String or(String str, String str2, String str3, String str4, String str5) {
        return alternation(str, str2, str3, str4, str5);
    }

    /** Builds a grouped alternation of six regex fragments. */
    public static String or(String str, String str2, String str3, String str4, String str5, String str6) {
        return alternation(str, str2, str3, str4, str5, str6);
    }

    /** Builds a grouped alternation of seven regex fragments. */
    public static String or(String str, String str2, String str3, String str4, String str5, String str6, String str7) {
        return alternation(str, str2, str3, str4, str5, str6, str7);
    }

    /** Builds a grouped alternation of eight regex fragments. */
    public static String or(String str, String str2, String str3, String str4, String str5, String str6, String str7, String str8) {
        return alternation(str, str2, str3, str4, str5, str6, str7, str8);
    }

    /** Builds a grouped alternation of nine regex fragments. */
    public static String or(String str, String str2, String str3, String str4, String str5, String str6, String str7, String str8, String str9) {
        return alternation(str, str2, str3, str4, str5, str6, str7, str8, str9);
    }

    /** Builds a grouped alternation of ten regex fragments. */
    public static String or(String str, String str2, String str3, String str4, String str5, String str6, String str7, String str8, String str9, String str10) {
        return alternation(str, str2, str3, str4, str5, str6, str7, str8, str9, str10);
    }

    /** Builds a grouped alternation of eleven regex fragments. */
    public static String or(String str, String str2, String str3, String str4, String str5, String str6, String str7, String str8, String str9, String str10, String str11) {
        return alternation(str, str2, str3, str4, str5, str6, str7, str8, str9, str10, str11);
    }

    /** Builds a grouped alternation of twelve regex fragments. */
    public static String or(String str, String str2, String str3, String str4, String str5, String str6, String str7, String str8, String str9, String str10, String str11, String str12) {
        return alternation(str, str2, str3, str4, str5, str6, str7, str8, str9, str10, str11, str12);
    }

    /**
     * Builds a grouped <em>negated</em> character class.
     *
     * <p>The {@code ^} must directly follow the opening bracket to negate the class.
     * Wrapping {@code ^str} in a group first would produce {@code [(^str)]}, a positive
     * class that merely contains the characters {@code (}, {@code ^} and {@code )}.
     *
     * @param str the characters (or ranges) to exclude
     * @return {@code ([^str])}
     */
    public static String rangeNot(String str) {
        return range("^" + str);
    }

    /**
     * Finds the apostrophe that starts a trailing letters-only suffix such as the
     * {@code 's} in {@code John's}.
     *
     * <p>Scans from the last character towards (but never including) index 0. The scan
     * succeeds on the first apostrophe that is not the final character, and fails as
     * soon as any other non-letter is met.
     *
     * @param str the token text
     * @return the index of that apostrophe, or -1 if there is no such suffix
     */
    private static int hasApostropheBlock(String str) {
        final int last = str.length() - 1;
        int pos = last;
        while (pos > 0) {
            char current = str.charAt(pos);
            boolean isInnerApostrophe = current == '\'' && pos < last;
            if (isInnerApostrophe) {
                return pos;
            }
            if (!Character.isLetter(current)) {
                return -1;
            }
            pos--;
        }
        return -1;
    }

    /**
     * Joins the text of the tokens at indices {@code i} (inclusive) to {@code i2}
     * (exclusive), with nothing inserted between them.
     *
     * @param list the tokens
     * @param i    first index to include
     * @param i2   index one past the last token to include
     * @return the concatenated token text; empty when {@code i >= i2}
     */
    private static <T extends WordToken> String concatenate(List<T> list, int i, int i2) {
        StringBuilder joined = new StringBuilder();
        for (int idx = i; idx < i2; idx++) {
            joined.append(list.get(idx).getWord());
        }
        return joined.toString();
    }

    /**
     * Sums the newline counts of the tokens at indices {@code i + 1} (inclusive) to
     * {@code i2} (exclusive). The token at {@code i} itself is deliberately skipped.
     *
     * @param list the tokens
     * @param i    index of the first token of the span (not counted)
     * @param i2   index one past the last token of the span
     * @return the summed newline count
     */
    private static <T extends WordToken> int countNewLines(List<T> list, int i, int i2) {
        int total = 0;
        int idx = i + 1;
        while (idx < i2) {
            total += list.get(idx).getNewLineCount();
            idx++;
        }
        return total;
    }

    /**
     * Tests whether {@code str} contains a URL. This is a search, not a full match:
     * a match anywhere in the string counts.
     *
     * @param str the text to inspect
     * @return true if the URL pattern occurs in {@code str}
     */
    public static boolean isUrl(String str) {
        Matcher matcher = urlPattern.matcher(str);
        return matcher.find();
    }

    /**
     * Tests whether {@code str} contains an e-mail address. This is a search, not a
     * full match: a match anywhere in the string counts.
     *
     * @param str the text to inspect
     * @return true if the e-mail pattern occurs in {@code str}
     */
    public static boolean isEmail(String str) {
        Matcher matcher = emailPattern.matcher(str);
        return matcher.find();
    }

    /**
     * Tests whether {@code str} contains an angle-bracketed tag. This is a search, not
     * a full match: a match anywhere in the string counts.
     *
     * @param str the text to inspect
     * @return true if the SGML pattern occurs in {@code str}
     */
    public static boolean isSgml(String str) {
        Matcher matcher = sgmlPattern.matcher(str);
        return matcher.find();
    }

    /**
     * Tests whether {@code str} contains a slash-separated date (digits/digits/digits).
     * This is a search, not a full match: a match anywhere in the string counts.
     *
     * @param str the text to inspect
     * @return true if the date pattern occurs in {@code str}
     */
    public static boolean isSlashDate(String str) {
        Matcher matcher = slashDatePattern.matcher(str);
        return matcher.find();
    }

    /**
     * Tests whether {@code str} contains an acronym according to the loose acronym
     * pattern. This is a search, not a full match: a match anywhere in the string counts.
     *
     * @param str the text to inspect
     * @return true if the acronym pattern occurs in {@code str}
     */
    public static boolean isAcronym(String str) {
        Matcher matcher = acronymPattern.matcher(str);
        return matcher.find();
    }

    /**
     * Tests whether {@code str} contains at least one digit. This is a search, not a
     * full match: a digit run anywhere in the string counts.
     *
     * @param str the text to inspect
     * @return true if the digit-sequence pattern occurs in {@code str}
     */
    public static boolean isDigitSeq(String str) {
        Matcher matcher = digitSeqPattern.matcher(str);
        return matcher.find();
    }

    /**
     * Counts the {@code '\n'} characters of {@code str} between offsets {@code i}
     * (inclusive) and {@code i2} (exclusive).
     *
     * @param str the text to scan
     * @param i   first offset to inspect
     * @param i2  offset one past the last one to inspect
     * @return the number of newline characters in that span; 0 when {@code i >= i2}
     */
    public int countNewLines(String str, int i, int i2) {
        int newLines = 0;
        int pos = i;
        while (pos < i2) {
            if ('\n' == str.charAt(pos)) {
                newLines++;
            }
            pos++;
        }
        return newLines;
    }

    /**
     * Tokenizes the buffer and converts every resulting token into a {@link Word}
     * carrying the token's text, start offset and end offset.
     *
     * @return the tokens as an array, in order
     */
    public Word[] tokenizeToWords() {
        List<WordToken> tokens = tokenizeToWordTokens();
        Word[] words = new Word[tokens.size()];
        int slot = 0;
        for (WordToken token : tokens) {
            words[slot] = new Word(token.getWord(), token.getStart(), token.getEnd());
            slot++;
        }
        return words;
    }

    /**
     * Tokenizes the buffer into {@link WordToken}s.
     *
     * <p>Works in two passes:
     * <ol>
     *   <li>Every match of the master token pattern becomes a token. A trailing
     *       {@code n't}, or a trailing apostrophe suffix found by
     *       {@code hasApostropheBlock}, is split off into its own token. The first
     *       token of each match records how many newlines lie between the previous
     *       match and this one; a split-off suffix records zero.</li>
     *   <li>Runs of up to {@link #MAX_MULTI_WORD_SIZE} adjacent tokens with no newline
     *       between them are concatenated and, if the result is a known abbreviation,
     *       replaced by a single token. Longer runs are tried first.</li>
     * </ol>
     * The result is handed to {@link #postprocess(List)} before being returned.
     *
     * @return the tokens, in text order
     */
    public List<WordToken> tokenizeToWordTokens() {
        // Pass 1: raw tokens straight from the regex.
        List<WordToken> tokens = new ArrayList<WordToken>();
        Matcher matcher = wordPattern.matcher(this.buffer);
        int previousEnd = 0;
        while (matcher.find()) {
            String text = matcher.group();
            int start = matcher.start();
            int end = matcher.end();
            // Newlines in the gap between the previous match and this one.
            int newLines = countNewLines(this.buffer, previousEnd, start);

            if (text.endsWith("n't")) {
                int split = text.length() - 3;
                if (split > 0) {
                    tokens.add(new WordToken(text.substring(0, split), start, end - 3, newLines));
                    tokens.add(new WordToken(text.substring(split), end - 3, end, 0));
                } else {
                    // The match is a bare "n't": it is the first (and only) token of this
                    // match, so it must carry the newline count. Dropping it would let the
                    // abbreviation pass glue this token to one on the previous line.
                    tokens.add(new WordToken(text, start, end, newLines));
                }
            } else {
                int apostrophe = hasApostropheBlock(text);
                if (apostrophe != -1) {
                    tokens.add(new WordToken(text.substring(0, apostrophe), start, start + apostrophe, newLines));
                    tokens.add(new WordToken(text.substring(apostrophe), start + apostrophe, end, 0));
                } else {
                    tokens.add(new WordToken(text, start, end, newLines));
                }
            }
            previousEnd = end;
        }

        // Pass 2: glue token runs that together form a known abbreviation.
        List<WordToken> glued = new ArrayList<WordToken>();
        for (int i = 0; i < tokens.size(); i++) {
            int limit = Math.min(tokens.size(), i + MAX_MULTI_WORD_SIZE);
            boolean merged = false;
            // A multi-word needs at least two tokens; try the longest run first.
            for (int stop = limit; stop > i + 1; stop--) {
                // An abbreviation cannot span a line break.
                if (countNewLines(tokens, i, stop) != 0) {
                    continue;
                }
                String candidate = concatenate(tokens, i, stop);
                if (this.mAbbreviations.contains(candidate)) {
                    WordToken first = tokens.get(i);
                    WordToken last = tokens.get(stop - 1);
                    glued.add(new WordToken(candidate, first.getStart(), last.getEnd(), first.getNewLineCount()));
                    // Resume after the run; the loop's i++ steps past its last token.
                    i = stop - 1;
                    merged = true;
                    break;
                }
            }
            if (!merged) {
                glued.add(tokens.get(i));
            }
        }
        return postprocess(glued);
    }

    /**
     * Hook applied to the token list just before tokenizeToWordTokens() returns it.
     * This implementation returns the list unchanged; being protected and non-final,
     * it can be overridden by subclasses.
     *
     * @param list the tokens produced by tokenizeToWordTokens()
     * @return the same list
     */
    protected List<WordToken> postprocess(List<WordToken> list) {
        return list;
    }

    /**
     * Tokenizes the buffer and renders the tokens as one space-separated string.
     * Each token is rendered through its {@code toString()}, and any run of two or
     * more whitespace characters in the result is collapsed to a single space.
     *
     * @return the rendered tokens; empty if there are none
     * @throws IOException declared by the signature; nothing in this method throws it directly
     */
    public String tokenizeText() throws IOException {
        StringBuilder rendered = new StringBuilder();
        boolean first = true;
        for (WordToken token : tokenizeToWordTokens()) {
            if (!first) {
                rendered.append(" ");
            }
            rendered.append(token);
            first = false;
        }
        return rendered.toString().replaceAll("\\s\\s+", " ");
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /* JADX WARN: Can't rename method to resolve collision */
    /**
     * Returns the next token, tokenizing the whole buffer on the first call and
     * serving later calls from the cached array.
     *
     * @return the next token, or null once every token has been returned
     */
    @Override // edu.stanford.nlp.process.AbstractTokenizer
    public Word getNext() {
        if (cachedTokens == null) {
            cachedTokens = tokenizeToWords();
            cachedPosition = 0;
        }
        return cachedPosition < cachedTokens.length ? cachedTokens[cachedPosition++] : null;
    }

    /**
     * Command-line entry point: reads the file named by the single argument, tokenizes
     * its contents and prints one token per line to standard output.
     *
     * @param strArr exactly one element, the path of the file to tokenize
     * @throws Exception if the file cannot be read
     */
    public static void main(String[] strArr) throws Exception {
        if (strArr.length != 1) {
            System.err.println("Usage: java edu.stanford.nlp.ie.machinereading.domains.ace.reader.RobustTokenizer <file to tokenize>");
            System.exit(1);
        }
        StringBuilder text = new StringBuilder();
        BufferedReader reader = new BufferedReader(new FileReader(strArr[0]));
        try {
            int ch;
            while ((ch = reader.read()) != -1) {
                text.append((char) ch);
            }
        } finally {
            // Release the file handle even if reading fails.
            reader.close();
        }
        for (Word word : new RobustTokenizer<Word>(text.toString()).tokenize()) {
            System.out.println(word);
        }
    }
}
