/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.nlp.process;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.objectbank.TokenizerFactory;
import edu.stanford.nlp.process.AbstractTokenizer;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.LexedTokenFactory;
import edu.stanford.nlp.process.PTB2TextLexer;
import edu.stanford.nlp.process.PTBLexer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.WordTokenFactory;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.Timing;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Pattern;

public class PTBTokenizer<T extends HasWord>
extends AbstractTokenizer<T> {
    private PTBLexer lexer;

    /**
     * Creates a {@link Word}-producing tokenizer over the given reader with the
     * {@code tokenizeNLs} flag switched off.
     *
     * @param r the reader supplying the text to tokenize
     * @return the tokenizer built by {@link #newPTBTokenizer(Reader, boolean)}
     *         with {@code false} as its second argument
     */
    public static PTBTokenizer<Word> newPTBTokenizer(Reader r) {
        final boolean tokenizeNLs = false;
        return newPTBTokenizer(r, tokenizeNLs);
    }

    /**
     * Creates a tokenizer that produces {@link Word} tokens.
     *
     * @param r the reader supplying the text to tokenize
     * @param tokenizeNLs when {@code true}, the {@code tokenizeNLs} option is
     *        passed on to the lexer
     * @return a new tokenizer that is neither invertible nor escape-suppressing
     */
    public static PTBTokenizer<Word> newPTBTokenizer(Reader r, boolean tokenizeNLs) {
        LexedTokenFactory<Word> wordFactory = new WordTokenFactory();
        return new PTBTokenizer<Word>(r, tokenizeNLs, false, false, wordFactory);
    }

    /**
     * Creates a tokenizer that produces {@link CoreLabel} tokens.
     *
     * @param r the reader supplying the text to tokenize
     * @param tokenizeNLs when {@code true}, the {@code tokenizeNLs} option is
     *        passed on to the lexer
     * @param invertible when {@code true}, the {@code invertible} option is
     *        passed on to the lexer
     * @return a new tokenizer with escape suppression switched off
     */
    public static PTBTokenizer<CoreLabel> newPTBTokenizer(Reader r, boolean tokenizeNLs, boolean invertible) {
        LexedTokenFactory<CoreLabel> labelFactory = new CoreLabelTokenFactory();
        return new PTBTokenizer<CoreLabel>(r, tokenizeNLs, invertible, false, labelFactory);
    }

    /**
     * Builds the comma-separated lexer option string from the boolean flags and
     * creates the underlying lexer with it.
     *
     * @param r the reader supplying the text to tokenize
     * @param tokenizeNLs appends {@code ",tokenizeNLs"} to the options when set
     * @param invertible appends {@code ",invertible"} to the options when set
     * @param suppressEscaping selects {@code ptb3Escaping=false} when set,
     *        {@code ptb3Escaping=true} otherwise
     * @param tokenFactory the factory handed to the lexer for creating tokens
     */
    private PTBTokenizer(Reader r, boolean tokenizeNLs, boolean invertible, boolean suppressEscaping, LexedTokenFactory<T> tokenFactory) {
        // The escaping setting always comes first; the other flags are optional suffixes.
        StringBuilder spec = new StringBuilder(suppressEscaping ? "ptb3Escaping=false" : "ptb3Escaping=true");
        if (tokenizeNLs) {
            spec.append(",tokenizeNLs");
        }
        if (invertible) {
            spec.append(",invertible");
        }
        String lexerOptions = spec.toString();
        this.lexer = new PTBLexer(r, tokenFactory, lexerOptions);
    }

    /**
     * Creates a tokenizer whose lexer is configured directly from an option
     * string.
     *
     * @param r the reader supplying the text to tokenize
     * @param tokenFactory the factory handed to the lexer for creating tokens
     * @param options the option string passed unchanged to the lexer
     */
    public PTBTokenizer(Reader r, LexedTokenFactory<T> tokenFactory, String options) {
        PTBLexer ptbLexer = new PTBLexer(r, tokenFactory, options);
        this.lexer = ptbLexer;
    }

    /**
     * Fetches the next token from the underlying lexer.
     *
     * @return whatever the lexer's {@code next()} call yields, cast to the
     *         token type
     * @throws RuntimeIOException wrapping any {@link IOException} raised by
     *         the lexer
     */
    @Override
    @SuppressWarnings("unchecked")
    protected T getNext() {
        final HasWord token;
        try {
            token = (HasWord) this.lexer.next();
        } catch (IOException ioe) {
            throw new RuntimeIOException(ioe);
        }
        // The lexer creates tokens through the LexedTokenFactory<T> it was given,
        // so the narrowing to T is unchecked here.
        return (T) token;
    }

    /**
     * Returns the literal token text used to represent a newline.
     *
     * @return the string {@code "*NL*"}
     */
    public static String getNewlineToken() {
        final String newlineToken = "*NL*";
        return newlineToken;
    }

    /**
     * Runs the given text through a {@link PTB2TextLexer} and concatenates
     * every piece the lexer returns.
     *
     * <p>An {@link IOException} from the lexer is printed via
     * {@code printStackTrace()} and whatever was accumulated up to that point
     * is returned.
     *
     * @param ptbText the text to convert; must not be {@code null}
     * @return the concatenation of all pieces produced by the lexer
     */
    public static String ptb2Text(String ptbText) {
        StringBuilder text = new StringBuilder(ptbText.length());
        PTB2TextLexer unlexer = new PTB2TextLexer(new StringReader(ptbText));
        try {
            // The lexer signals end of input by returning null.
            for (String piece = unlexer.next(); piece != null; piece = unlexer.next()) {
                text.append(piece);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return text.toString();
    }

    /**
     * Converts a single token by padding it with one space on each side,
     * running it through {@link #ptb2Text(String)}, and trimming the result.
     *
     * @param ptbText the token text to convert
     * @return the trimmed conversion result
     */
    public static String ptbToken2Text(String ptbText) {
        String padded = " " + ptbText + " ";
        String converted = ptb2Text(padded);
        return converted.trim();
    }

    /**
     * Streams text from a reader through a {@link PTB2TextLexer}, writing each
     * piece the lexer returns to the given writer.
     *
     * <p>The writer is neither flushed nor closed by this method.
     *
     * @param ptbText the reader supplying the text to convert
     * @param w the writer receiving the converted pieces
     * @return the number of pieces written
     * @throws IOException if the lexer or the writer fails
     */
    public static int ptb2Text(Reader ptbText, Writer w) throws IOException {
        PTB2TextLexer unlexer = new PTB2TextLexer(ptbText);
        int written = 0;
        // The lexer signals end of input by returning null.
        for (String piece = unlexer.next(); piece != null; piece = unlexer.next()) {
            written++;
            w.write(piece);
        }
        return written;
    }

    /**
     * Untokenizes the given input files (or standard input when the list is
     * empty) with {@link #ptb2Text(Reader, Writer)} and reports throughput on
     * standard error.
     *
     * <p>When {@code outputFileList} is {@code null}, all output goes to
     * standard output. In that case the writer is only flushed between files
     * and closed after the last one, so that closing the wrapper around
     * {@code System.out} after the first file does not discard the output of
     * the remaining files. Each input reader is closed once its file has been
     * processed, including when processing fails.
     *
     * @param inputFileList the inputs to read; empty means read standard input
     * @param outputFileList the output file for each input (same index), or
     *        {@code null} to write everything to standard output
     * @param charset the character encoding for all readers and writers
     * @throws IOException if reading or writing fails
     */
    private static void untok(List<String> inputFileList, List<String> outputFileList, String charset) throws IOException {
        Timing t = new Timing();
        int numTokens = 0;
        int sz = inputFileList.size();
        if (sz == 0) {
            InputStreamReader r = new InputStreamReader(System.in, charset);
            PrintWriter out = new PrintWriter(new OutputStreamWriter(System.out, charset), true);
            try {
                numTokens = PTBTokenizer.ptb2Text(r, out);
            } finally {
                out.close();
            }
        } else {
            boolean toStdout = outputFileList == null;
            for (int j = 0; j < sz; ++j) {
                BufferedReader r = IOUtils.readReaderFromString(inputFileList.get(j), charset);
                PrintWriter out = null;
                try {
                    out = toStdout
                            ? new PrintWriter(new OutputStreamWriter(System.out, charset), true)
                            : new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFileList.get(j)), charset)), true);
                    numTokens += PTBTokenizer.ptb2Text(r, out);
                } finally {
                    IOUtils.closeIgnoringExceptions(r);
                    if (out != null) {
                        if (toStdout && j != sz - 1) {
                            // ptb2Text uses write(), which autoflush does not cover, and
                            // closing here would close System.out for the files that follow.
                            out.flush();
                        } else {
                            out.close();
                        }
                    }
                }
            }
        }
        long millis = t.stop();
        double wordspersec = (double) numTokens / ((double) millis / 1000.0);
        System.err.printf("PTBTokenizer untokenized %d tokens at %.2f tokens per second.", numTokens, wordspersec);
    }

    /**
     * Joins the given words with {@code StringUtils.join} and converts the
     * joined string with {@link #ptb2Text(String)}.
     *
     * @param ptbWords the words to join and convert
     * @return the converted text
     */
    public static String ptb2Text(List<String> ptbWords) {
        String joined = StringUtils.join(ptbWords);
        return ptb2Text(joined);
    }

    /**
     * Extracts {@code word()} from every element and converts the resulting
     * word list with {@link #ptb2Text(List)}.
     *
     * @param ptbWords the tokens whose words are to be converted
     * @return the converted text
     */
    public static String labelList2Text(List<? extends HasWord> ptbWords) {
        List<String> words = new ArrayList<String>();
        Iterator<? extends HasWord> it = ptbWords.iterator();
        while (it.hasNext()) {
            words.add(it.next().word());
        }
        return ptb2Text(words);
    }

    /**
     * Tokenizes the given input files (or standard input when the list is
     * empty) with {@code tokReader} and reports throughput on standard error.
     *
     * <p>Standard input is processed one line at a time; when
     * {@code preserveLines} is set, a line break is written after each input
     * line. For file input with {@code outputFileList == null}, everything is
     * written to standard output: the writer is flushed between files, so that
     * text buffered by {@code print} is not lost when the next file gets a
     * fresh writer, and it is closed only after the last file. Readers and
     * file writers are released even when tokenization fails.
     *
     * @param inputFileList the inputs to read; empty means read standard input
     * @param outputFileList the output file for each input (same index), or
     *        {@code null} to write everything to standard output
     * @param charset the character encoding for all readers and writers
     * @param parseInsideBegin forwarded to {@code tokReader}
     * @param parseInsideEnd forwarded to {@code tokReader}
     * @param options the tokenizer option string forwarded to {@code tokReader}
     * @param preserveLines forwarded to {@code tokReader}; also controls the
     *        per-line break in standard-input mode
     * @param dump forwarded to {@code tokReader}
     * @param lowerCase forwarded to {@code tokReader}
     * @throws IOException if reading or writing fails
     */
    private static void tok(List<String> inputFileList, List<String> outputFileList, String charset, Pattern parseInsideBegin, Pattern parseInsideEnd, String options, boolean preserveLines, boolean dump, boolean lowerCase) throws IOException {
        long numTokens = 0L;
        int numFiles = inputFileList.size();
        long start = System.nanoTime();
        if (numFiles == 0) {
            BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in, charset));
            PrintWriter out = new PrintWriter(new OutputStreamWriter(System.out, charset), true);
            try {
                String line;
                while ((line = stdin.readLine()) != null) {
                    numTokens += PTBTokenizer.tokReader(new StringReader(line), out, parseInsideBegin, parseInsideEnd, options, preserveLines, dump, lowerCase);
                    if (preserveLines) {
                        // readLine() strips the terminator, so restore one line break per input line.
                        out.println();
                    }
                }
            } finally {
                IOUtils.closeIgnoringExceptions(out);
            }
        } else {
            boolean toStdout = outputFileList == null;
            for (int j = 0; j < numFiles; ++j) {
                BufferedReader r = IOUtils.readReaderFromString(inputFileList.get(j), charset);
                PrintWriter out = null;
                try {
                    out = toStdout
                            ? new PrintWriter(new OutputStreamWriter(System.out, charset), true)
                            : new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFileList.get(j)), charset)), true);
                    numTokens += PTBTokenizer.tokReader(r, out, parseInsideBegin, parseInsideEnd, options, preserveLines, dump, lowerCase);
                } finally {
                    IOUtils.closeIgnoringExceptions(r);
                    if (out != null) {
                        if (toStdout && j != numFiles - 1) {
                            // Closing would shut System.out for the remaining files; flushing
                            // keeps any text that print() left in this wrapper's buffer.
                            out.flush();
                        } else {
                            IOUtils.closeIgnoringExceptions(out);
                        }
                    }
                }
            }
        }
        long duration = System.nanoTime() - start;
        double wordsPerSec = (double) numTokens / ((double) duration / 1.0E9);
        System.err.printf("PTBTokenizer tokenized %d tokens at %.2f tokens per second.%n", numTokens, wordsPerSec);
    }

    /**
     * Tokenizes everything available from {@code r} and prints the selected
     * tokens to {@code out2}.
     *
     * <p>Printing starts enabled only when {@code parseInsideBegin} is
     * {@code null}. A token whose original text fully matches
     * {@code parseInsideBegin} turns printing on, one that fully matches
     * {@code parseInsideEnd} turns it off, and neither kind of marker token is
     * printed. With {@code preserveLines}, tokens are separated by single
     * spaces and a token whose text is {@code "*NL*"} ends the current output
     * line; otherwise each token is printed on its own line.
     *
     * @param r the reader supplying the text to tokenize
     * @param out2 the writer receiving the output
     * @param parseInsideBegin pattern that switches printing on, or {@code null}
     * @param parseInsideEnd pattern that switches printing off, or {@code null}
     * @param options the option string for the tokenizer
     * @param preserveLines whether to keep tokens of one line together
     * @param dump when set, prints {@code toString()} of the token instead of
     *        its text
     * @param lowerCase when set, lower-cases the token text (English locale)
     *        and stores it back into the token
     * @return the number of tokens read, whether or not they were printed
     */
    private static int tokReader(Reader r, PrintWriter out2, Pattern parseInsideBegin, Pattern parseInsideEnd, String options, boolean preserveLines, boolean dump, boolean lowerCase) {
        int numTokens = 0;
        boolean printing = (parseInsideBegin == null);
        boolean atLineStart = true;
        PTBTokenizer<CoreLabel> tokenizer = new PTBTokenizer<CoreLabel>(r, new CoreLabelTokenFactory(), options);
        // The update clause counts every token, including those skipped via continue.
        for (; tokenizer.hasNext(); ++numTokens) {
            CoreLabel token = (CoreLabel) tokenizer.next();
            String original = (String) token.get(CoreAnnotations.TextAnnotation.class);
            String text = original;
            if (lowerCase) {
                text = original.toLowerCase(Locale.ENGLISH);
                token.set(CoreAnnotations.TextAnnotation.class, text);
            }

            // Region markers are matched against the original (not lower-cased) text.
            if (parseInsideBegin != null && parseInsideBegin.matcher(original).matches()) {
                printing = true;
                continue;
            }
            if (parseInsideEnd != null && parseInsideEnd.matcher(original).matches()) {
                printing = false;
                continue;
            }
            if (!printing) {
                continue;
            }

            if (dump) {
                text = token.toString();
            }
            if (!preserveLines) {
                out2.println(text);
                continue;
            }
            if ("*NL*".equals(original)) {
                atLineStart = true;
                out2.println();
                continue;
            }
            if (atLineStart) {
                atLineStart = false;
            } else {
                out2.print(" ");
            }
            out2.print(text);
        }
        return numTokens;
    }

    /**
     * Returns the tokenizer factory produced by
     * {@code PTBTokenizerFactory.newTokenizerFactory()}.
     *
     * @return a factory for tokenizers producing {@link Word} tokens
     */
    public static TokenizerFactory<Word> factory() {
        TokenizerFactory<Word> wordTokenizerFactory = PTBTokenizerFactory.newTokenizerFactory();
        return wordTokenizerFactory;
    }

    /**
     * Creates a tokenizer factory for an arbitrary token type.
     *
     * <p>The factory is instantiated with an explicit type argument rather
     * than as a raw type, so the return statement is type-checked instead of
     * relying on an unchecked conversion.
     *
     * @param tokenizeNLs when {@code true}, the {@code tokenizeNLs} option is
     *        included in the factory's option string
     * @param factory the token factory used by tokenizers the result creates
     * @param <T> the token type
     * @return a new factory that is neither invertible nor escape-suppressing
     */
    public static <T extends HasWord> TokenizerFactory<T> factory(boolean tokenizeNLs, LexedTokenFactory<T> factory) {
        return new PTBTokenizerFactory<T>(tokenizeNLs, false, false, factory);
    }

    /**
     * Returns the factory produced by
     * {@code PTBTokenizerFactory.newPTBTokenizerFactory(tokenizeNLs, invertible)}.
     *
     * @param tokenizeNLs forwarded unchanged
     * @param invertible forwarded unchanged
     * @return a factory for tokenizers producing {@link CoreLabel} tokens
     */
    public static TokenizerFactory<CoreLabel> factory(boolean tokenizeNLs, boolean invertible) {
        TokenizerFactory<CoreLabel> labelTokenizerFactory = PTBTokenizerFactory.newPTBTokenizerFactory(tokenizeNLs, invertible);
        return labelTokenizerFactory;
    }

    /**
     * Creates a tokenizer factory for an arbitrary token type from an option
     * string.
     *
     * <p>The factory is instantiated with an explicit type argument rather
     * than as a raw type, so the return statement is type-checked instead of
     * relying on an unchecked conversion.
     *
     * @param factory the token factory used by tokenizers the result creates
     * @param options the option string stored in the factory
     * @param <T> the token type
     * @return a new factory holding the given token factory and options
     */
    public static <T extends HasWord> TokenizerFactory<T> factory(LexedTokenFactory<T> factory, String options) {
        return new PTBTokenizerFactory<T>(factory, options);
    }

    /**
     * Builds the table that maps each command-line flag name to the number of
     * arguments registered for it; {@code main} passes this table to
     * {@code StringUtils.argsToProperties}.
     *
     * @return a new mutable map from flag name to argument count
     */
    private static Map<String, Integer> optionArgDefs() {
        // Parallel tables, kept in the same order the entries are inserted.
        final String[] flagNames = {
            "options", "ioFileList", "lowerCase", "dump", "untok", "encoding", "parseInside", "preserveLines"
        };
        final int[] argCounts = {1, 0, 0, 0, 0, 1, 1, 0};

        Map<String, Integer> defs = new HashMap<String, Integer>();
        for (int i = 0; i < flagNames.length; i++) {
            defs.put(flagNames[i], argCounts[i]);
        }
        return defs;
    }

    /**
     * Command-line entry point: tokenizes (or, with {@code -untok},
     * untokenizes) the named files, or standard input when no file is given.
     *
     * <p>Recognized flags: {@code -h}/{@code -help}, {@code -preserveLines},
     * {@code -lowerCase}, {@code -dump}, {@code -untok}, {@code -ioFileList},
     * {@code -encoding <charset>} (default {@code utf-8}),
     * {@code -parseInside <regex>} and {@code -options <tokenizer options>}.
     * With {@code -ioFileList}, each named file is read as a list of
     * whitespace-separated {@code input [output]} pairs, one per line; a
     * missing output name defaults to the input name plus {@code ".tok"}.
     *
     * @param args the command-line arguments
     * @throws IOException if reading or writing any file fails
     */
    public static void main(String[] args) throws IOException {
        Properties options = StringUtils.argsToProperties(args, PTBTokenizer.optionArgDefs());
        boolean showHelp = PropertiesUtils.getBool(options, "help", false);
        showHelp = PropertiesUtils.getBool(options, "h", showHelp);
        if (showHelp) {
            System.err.println("Usage: java edu.stanford.nlp.process.PTBTokenizer [options]* filename*");
            System.err.println("  options: -h|-preserveLines|-lowerCase|-dump|-untok|-ioFileList|-encoding|-parseInside|-options");
            System.exit(0);
        }

        StringBuilder optionsSB = new StringBuilder();
        String tokenizerOptions = options.getProperty("options", null);
        if (tokenizerOptions != null) {
            optionsSB.append(tokenizerOptions);
        }
        boolean preserveLines = PropertiesUtils.getBool(options, "preserveLines", false);
        if (preserveLines) {
            // Only separate with a comma when something precedes it, so the
            // option string never starts with an empty entry.
            if (optionsSB.length() > 0) {
                optionsSB.append(',');
            }
            optionsSB.append("tokenizeNLs");
        }

        boolean inputOutputFileList = PropertiesUtils.getBool(options, "ioFileList", false);
        boolean lowerCase = PropertiesUtils.getBool(options, "lowerCase", false);
        boolean dump = PropertiesUtils.getBool(options, "dump", false);
        boolean untok = PropertiesUtils.getBool(options, "untok", false);
        String charset = options.getProperty("encoding", "utf-8");

        String parseInsideKey = options.getProperty("parseInside", null);
        Pattern parseInsideBegin = null;
        Pattern parseInsideEnd = null;
        if (parseInsideKey != null) {
            try {
                parseInsideBegin = Pattern.compile("<(?:" + parseInsideKey + ")(?:\\s[^>]*?)?>");
                parseInsideEnd = Pattern.compile("</(?:" + parseInsideKey + ")(?:\\s[^>]*?)?>");
            } catch (RuntimeException e) {
                // Still fall back to tokenizing everything, but tell the user
                // the option was dropped instead of ignoring it silently.
                System.err.println("PTBTokenizer: ignoring -parseInside, cannot compile pattern: " + e.getMessage());
                parseInsideBegin = null;
                parseInsideEnd = null;
            }
        }

        // Arguments that are not flags are stored under the empty key.
        String parsedArgStr = options.getProperty("", null);
        String[] parsedArgs = parsedArgStr == null ? null : parsedArgStr.split("\\s+");

        ArrayList<String> inputFileList = new ArrayList<String>();
        ArrayList<String> outputFileList = null;
        if (inputOutputFileList && parsedArgs != null) {
            outputFileList = new ArrayList<String>();
            for (String fileName : parsedArgs) {
                InputStream listStream = new FileInputStream(fileName);
                try {
                    BufferedReader r = new BufferedReader(new InputStreamReader(listStream, charset));
                    String inLine;
                    while ((inLine = r.readLine()) != null) {
                        // Trim first: a blank or indented line would otherwise
                        // split into an empty first field, i.e. an empty file name.
                        String entry = inLine.trim();
                        if (entry.length() == 0) {
                            continue;
                        }
                        String[] fields = entry.split("\\s+");
                        inputFileList.add(fields[0]);
                        if (fields.length > 1) {
                            outputFileList.add(fields[1]);
                        } else {
                            outputFileList.add(fields[0] + ".tok");
                        }
                    }
                } finally {
                    // Closing the underlying stream releases the file even when
                    // the charset is unsupported or reading fails.
                    IOUtils.closeIgnoringExceptions(listStream);
                }
            }
        } else if (parsedArgs != null) {
            inputFileList.addAll(Arrays.asList(parsedArgs));
        }

        if (untok) {
            PTBTokenizer.untok(inputFileList, outputFileList, charset);
        } else {
            PTBTokenizer.tok(inputFileList, outputFileList, charset, parseInsideBegin, parseInsideEnd, optionsSB.toString(), preserveLines, dump, lowerCase);
        }
    }

    public static class PTBTokenizerFactory<T extends HasWord>
    implements TokenizerFactory<T> {
        protected LexedTokenFactory<T> factory;
        protected String options;

        public static TokenizerFactory<Word> newTokenizerFactory() {
            return PTBTokenizerFactory.newPTBTokenizerFactory(new WordTokenFactory(), "");
        }

        public static PTBTokenizerFactory<Word> newPTBTokenizerFactory(boolean tokenizeNLs) {
            return new PTBTokenizerFactory<Word>(tokenizeNLs, false, false, new WordTokenFactory());
        }

        public static PTBTokenizerFactory<Word> newWordTokenizerFactory(String options) {
            return new PTBTokenizerFactory<Word>(new WordTokenFactory(), options);
        }

        public static PTBTokenizerFactory<CoreLabel> newCoreLabelTokenizerFactory(String options) {
            return new PTBTokenizerFactory<CoreLabel>(new CoreLabelTokenFactory(), options);
        }

        public static <T extends HasWord> PTBTokenizerFactory<T> newPTBTokenizerFactory(LexedTokenFactory<T> tokenFactory, String options) {
            return new PTBTokenizerFactory<T>(tokenFactory, options);
        }

        public static PTBTokenizerFactory<CoreLabel> newPTBTokenizerFactory(boolean tokenizeNLs, boolean invertible) {
            return new PTBTokenizerFactory<CoreLabel>(tokenizeNLs, invertible, false, new CoreLabelTokenFactory());
        }

        private PTBTokenizerFactory(boolean tokenizeNLs, boolean invertible, boolean suppressEscaping, LexedTokenFactory<T> factory) {
            this.factory = factory;
            StringBuilder optionsSB = new StringBuilder();
            if (suppressEscaping) {
                optionsSB.append("ptb3Escaping=false");
            } else {
                optionsSB.append("ptb3Escaping=true");
            }
            if (tokenizeNLs) {
                optionsSB.append(",tokenizeNLs");
            }
            if (invertible) {
                optionsSB.append(",invertible");
            }
            this.options = optionsSB.toString();
        }

        private PTBTokenizerFactory(LexedTokenFactory<T> tokenFactory, String options) {
            this.factory = tokenFactory;
            this.options = options;
        }

        @Override
        public Iterator<T> getIterator(Reader r) {
            return this.getTokenizer(r);
        }

        @Override
        public Tokenizer<T> getTokenizer(Reader r) {
            return new PTBTokenizer<T>(r, this.factory, this.options);
        }

        @Override
        public Tokenizer<T> getTokenizer(Reader r, String extraOptions) {
            if (this.options == null || "".equals(this.options)) {
                return new PTBTokenizer<T>(r, this.factory, extraOptions);
            }
            return new PTBTokenizer<T>(r, this.factory, this.options + ',' + extraOptions);
        }

        @Override
        public void setOptions(String options) {
            this.options = options;
        }
    }
}

