package opennlp.tools.tokenize;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import opennlp.tools.util.Span;

/* loaded from: input_file:lib/opennlp-tools-2.5.4.jar:opennlp/tools/tokenize/WordpieceTokenizer.class */
/**
 * A {@link Tokenizer} that splits text into word pieces using a greedy,
 * longest-match-first lookup against a fixed vocabulary.
 *
 * <p>The input is first split on whitespace, with every run of punctuation
 * characters isolated as its own word. Each word is then broken into the longest
 * vocabulary entries that cover it from left to right; pieces that do not start
 * a word are looked up (and emitted) with a {@code "##"} prefix. The result is
 * always wrapped in a leading {@code "[CLS]"} and a trailing {@code "[SEP]"}.
 */
public class WordpieceTokenizer implements Tokenizer {
    /** Matches a run of one or more punctuation characters. */
    private static final Pattern PUNCTUATION_PATTERN = Pattern.compile("\\p{Punct}+");
    /** Marker emitted as the first token of every result. */
    private static final String CLASSIFICATION_TOKEN = "[CLS]";
    /** Marker emitted as the last token of every result. */
    private static final String SEPARATOR_TOKEN = "[SEP]";
    /** Placeholder emitted when a word (or the rest of it) cannot be matched. */
    private static final String UNKNOWN_TOKEN = "[UNK]";
    /** Prefix of every piece that does not start at the beginning of a word. */
    private static final String CONTINUATION_PREFIX = "##";
    /** Maximum word length used when the caller does not supply one. */
    private static final int DEFAULT_MAX_TOKEN_LENGTH = 50;

    private final Set<String> vocabulary;
    private final int maxTokenLength;

    /**
     * Creates a tokenizer with the default maximum word length of 50 characters.
     *
     * @param set the vocabulary of known word pieces; continuation pieces are
     *            expected to be stored with their {@code "##"} prefix
     */
    public WordpieceTokenizer(Set<String> set) {
        this(set, DEFAULT_MAX_TOKEN_LENGTH);
    }

    /**
     * Creates a tokenizer with an explicit maximum word length.
     *
     * @param set the vocabulary of known word pieces; continuation pieces are
     *            expected to be stored with their {@code "##"} prefix
     * @param i   the maximum length (in {@code char}s) of a word that will be
     *            split into pieces; longer words become a single {@code "[UNK]"}
     */
    public WordpieceTokenizer(Set<String> set, int i) {
        this.vocabulary = set;
        this.maxTokenLength = i;
    }

    /**
     * Not implemented by this tokenizer.
     *
     * @param str the text to tokenize (ignored)
     * @return always {@code null}; callers must not rely on span positions
     */
    @Override // opennlp.tools.tokenize.Tokenizer
    public Span[] tokenizePos(String str) {
        return null;
    }

    /**
     * Splits the given text into word pieces.
     *
     * @param str the text to tokenize; must not be {@code null}
     * @return the word pieces, starting with {@code "[CLS]"} and ending with
     *         {@code "[SEP]"}
     */
    @Override // opennlp.tools.tokenize.Tokenizer
    public String[] tokenize(String str) {
        List<String> tokens = new ArrayList<>();
        tokens.add(CLASSIFICATION_TOKEN);

        // Surround each punctuation run with spaces so the whitespace split
        // below yields it as a separate word.
        String spaced = PUNCTUATION_PATTERN.matcher(str).replaceAll(" $0 ");

        for (String word : WhitespaceTokenizer.INSTANCE.tokenize(spaced)) {
            if (word.length() <= this.maxTokenLength) {
                addWordPieces(word.toCharArray(), tokens);
            } else {
                // Over-long words are not split at all.
                tokens.add(UNKNOWN_TOKEN);
            }
        }

        tokens.add(SEPARATOR_TOKEN);
        return tokens.toArray(new String[0]);
    }

    /**
     * Appends the word pieces covering {@code chars} to {@code tokens}.
     *
     * <p>Starting at the beginning of the word, the longest vocabulary entry
     * matching the remaining characters is taken repeatedly. If at some position
     * no entry matches, {@code "[UNK]"} is appended and the rest of the word is
     * dropped; pieces already appended for this word are kept.
     */
    private void addWordPieces(char[] chars, List<String> tokens) {
        int start = 0;
        // Bounded by the word length: the loop ends as soon as the whole word
        // has been consumed, or earlier via the [UNK] return below.
        while (start < chars.length) {
            int end = chars.length;
            String match = null;

            // Shrink the candidate from the right until it is in the vocabulary.
            while (start < end) {
                String piece = String.valueOf(chars, start, end - start);
                if (start > 0) {
                    piece = CONTINUATION_PREFIX + piece;
                }
                if (this.vocabulary.contains(piece)) {
                    match = piece;
                    break;
                }
                end--;
            }

            if (match == null) {
                tokens.add(UNKNOWN_TOKEN);
                return;
            }

            tokens.add(match);
            // Continue right after the piece that was just matched.
            start = end;
        }
    }

    /**
     * Returns the maximum word length that will be split into pieces.
     *
     * @return the maximum word length in {@code char}s
     */
    public int getMaxTokenLength() {
        return this.maxTokenLength;
    }
}
