package de.fau.cs.jstk.arch;

import de.fau.cs.jstk.exceptions.OutOfVocabularyException;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

/* loaded from: input_file:de/fau/cs/jstk/arch/Tokenizer.class */
/**
 * Maps words to their token sequences using a list of {@link Tokenization}
 * entries that are built against a given {@link Alphabet}.
 *
 * <p>{@link #tokenize(String)} and {@link #validate(String)} use binary search
 * and therefore require {@link #tokenizations} to be sorted by the natural
 * ordering of {@code Tokenization}. {@link #addTokenizationsFromFile(File)}
 * sorts automatically; after calling one of the {@code addTokenization}
 * overloads (or modifying the public list directly) the caller must invoke
 * {@link #sortTokenizations()} before performing lookups.
 */
public final class Tokenizer {
    /** Alphabet handed to every {@link Tokenization} parsed by this tokenizer. */
    private final Alphabet alphabet;

    /** Known tokenizations; must be sorted for the binary-search based lookups. */
    public LinkedList<Tokenization> tokenizations;

    public static final String SYNOPSIS = "sikoried, 3/6/2010\nThe Tokenizer can  be used to verify a lexicon using alphabet and tokenization file.\nusage: Tokenizer alphabet tokenization1 [tokenization2 ...]\n";

    /**
     * Creates an empty tokenizer for the given alphabet.
     *
     * @param alphabet alphabet used when parsing tokenization lines
     */
    public Tokenizer(Alphabet alphabet) {
        this.tokenizations = new LinkedList<>();
        this.alphabet = alphabet;
    }

    /**
     * Creates a tokenizer and loads (and sorts) the tokenizations from a file.
     *
     * @param alphabet alphabet used when parsing tokenization lines
     * @param file tokenization file, one entry per line
     * @throws IOException if the file cannot be read or parsed
     */
    public Tokenizer(Alphabet alphabet, File file) throws IOException {
        this(alphabet);
        addTokenizationsFromFile(file);
    }

    /**
     * Looks up the token sequence of a word via binary search.
     *
     * @param str word to look up
     * @return the {@code sequence} of the matching tokenization
     * @throws OutOfVocabularyException if no tokenization matches the word
     */
    public String[] tokenize(String str) throws OutOfVocabularyException {
        int index = Collections.binarySearch(this.tokenizations, new Tokenization(str));
        if (index < 0) {
            throw new OutOfVocabularyException(str);
        }
        return this.tokenizations.get(index).sequence;
    }

    /**
     * Checks via binary search whether a tokenization exists for the word.
     *
     * @param str word to look up
     * @return {@code true} if a matching tokenization was found
     */
    public boolean validate(String str) {
        return Collections.binarySearch(this.tokenizations, new Tokenization(str, new String[0])) >= 0;
    }

    @Override
    public String toString() {
        return "Tokenizer with " + this.tokenizations.size() + " tokenizations";
    }

    /**
     * Prints every tokenization, one per line, in current list order.
     *
     * @param printStream destination stream
     */
    public void dump(PrintStream printStream) {
        for (Tokenization tokenization : this.tokenizations) {
            printStream.println(tokenization);
        }
    }

    /**
     * Appends a tokenization without re-sorting; call
     * {@link #sortTokenizations()} before the next binary-search lookup.
     *
     * @param tokenization entry to append
     */
    public void addTokenization(Tokenization tokenization) {
        this.tokenizations.add(tokenization);
    }

    /** Sorts the tokenizations by their natural ordering. */
    public void sortTokenizations() {
        Collections.sort(this.tokenizations);
    }

    /**
     * Finds the tokenization of a word by a linear scan comparing
     * {@code Tokenization.word}; unlike {@link #tokenize(String)} this does
     * not require the list to be sorted.
     *
     * @param str word to look up
     * @return the first tokenization whose word equals {@code str}
     * @throws OutOfVocabularyException if no entry matches
     */
    public Tokenization getWordTokenization(String str) throws OutOfVocabularyException {
        for (Tokenization candidate : this.tokenizations) {
            if (candidate.word.equals(str)) {
                return candidate;
            }
        }
        throw new OutOfVocabularyException(str);
    }

    /**
     * Tokenizes a whitespace-separated sentence word by word.
     *
     * <p>Leading whitespace makes {@code split} emit an empty first token; such
     * empty tokens are skipped, so an empty or whitespace-only sentence yields
     * an empty list rather than an out-of-vocabulary error for {@code ""}.
     *
     * @param str sentence whose words are separated by whitespace
     * @return tokenizations of the words, in sentence order
     * @throws OutOfVocabularyException if any word has no tokenization
     */
    public List<Tokenization> getSentenceTokenization(String str) throws OutOfVocabularyException {
        List<Tokenization> result = new LinkedList<>();
        for (String word : str.split("\\s+")) {
            if (word.isEmpty()) {
                continue;
            }
            result.add(getWordTokenization(word));
        }
        return result;
    }

    /**
     * Builds a tokenization from the line {@code str + " " + str2} and appends
     * it without re-sorting.
     *
     * @param str the word
     * @param str2 the textual token sequence for the word
     * @throws IOException if the tokenization cannot be constructed
     */
    public void addTokenization(String str, String str2) throws IOException {
        addTokenization(new Tokenization(str + " " + str2, this.alphabet));
    }

    /**
     * Reads one tokenization per line from the file, then sorts the list.
     *
     * <p>The reader is always closed. Entries are staged and only merged once
     * the whole file was parsed, so a failure part-way through leaves the
     * existing (sorted) tokenizations untouched.
     *
     * @param file tokenization file, read with the platform default charset
     * @throws IOException if the file cannot be read or a line cannot be parsed
     */
    public void addTokenizationsFromFile(File file) throws IOException {
        List<Tokenization> parsed = new LinkedList<>();
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = reader.readLine()) != null) {
                parsed.add(new Tokenization(line, this.alphabet));
            }
        }
        this.tokenizations.addAll(parsed);
        sortTokenizations();
    }

    /**
     * Command line entry point: loads an alphabet and one or more tokenization
     * files (as stated in {@link #SYNOPSIS}) and dumps the sorted result to
     * standard output.
     *
     * @param strArr {@code alphabet tokenization1 [tokenization2 ...]}
     * @throws IOException if any input file cannot be read or parsed
     */
    public static void main(String[] strArr) throws IOException {
        if (strArr.length < 2) {
            System.err.println(SYNOPSIS);
            System.exit(1);
        }
        Tokenizer tokenizer = new Tokenizer(new Alphabet(new File(strArr[0])));
        for (int i = 1; i < strArr.length; i++) {
            tokenizer.addTokenizationsFromFile(new File(strArr[i]));
        }
        tokenizer.dump(System.out);
    }
}
