public class NGramUtils
extends java.lang.Object
| Constructor and Description |
|---|
NGramUtils() |
| Modifier and Type | Method and Description |
|---|---|
static double |
calculateBigramMLProbability(java.lang.String x0,
java.lang.String x1,
java.util.Collection<StringList> set)
calculate the probability of a bigram in a vocabulary using maximum likelihood estimation
|
static double |
calculateBigramPriorSmoothingProbability(java.lang.String x0,
java.lang.String x1,
java.util.Collection<StringList> set,
java.lang.Double k)
calculate the probability of a bigram in a vocabulary using prior Laplace smoothing algorithm
|
static double |
calculateLaplaceSmoothingProbability(StringList ngram,
java.lang.Iterable<StringList> set,
java.lang.Double k)
calculate the probability of an ngram in a vocabulary using Laplace smoothing algorithm
|
static double |
calculateMissingNgramProbabilityMass(StringList ngram,
java.lang.Double discount,
java.lang.Iterable<StringList> set)
calculate the probability of an ngram in a vocabulary using the missing probability mass algorithm
|
static double |
calculateNgramMLProbability(StringList ngram,
java.lang.Iterable<StringList> set)
calculate the probability of an ngram in a vocabulary using maximum likelihood estimation
|
static double |
calculateTrigramLinearInterpolationProbability(java.lang.String x0,
java.lang.String x1,
java.lang.String x2,
java.util.Collection<StringList> set,
java.lang.Double lambda1,
java.lang.Double lambda2,
java.lang.Double lambda3)
calculate the probability of a trigram in a vocabulary using a linear interpolation algorithm
|
static double |
calculateTrigramMLProbability(java.lang.String x0,
java.lang.String x1,
java.lang.String x2,
java.lang.Iterable<StringList> set)
calculate the probability of a trigram in a vocabulary using maximum likelihood estimation
|
static double |
calculateUnigramMLProbability(java.lang.String word,
java.util.Collection<StringList> set)
calculate the probability of a unigram in a vocabulary using maximum likelihood estimation
|
static java.util.Collection<java.lang.String[]> |
getNGrams(java.lang.String[] sequence,
int size)
Get the ngrams of dimension n of a certain input sequence of tokens.
|
static java.util.Collection<StringList> |
getNGrams(StringList sequence,
int size)
Get the ngrams of dimension n of a certain input sequence of tokens.
|
static StringList |
getNMinusOneTokenFirst(StringList ngram)
get the (n-1)th ngram of a given ngram, that is the same ngram except the last word in the ngram
|
static StringList |
getNMinusOneTokenLast(StringList ngram)
get the (n-1)th ngram of a given ngram, that is the same ngram except the first word in the ngram
|
public static double calculateLaplaceSmoothingProbability(StringList ngram, java.lang.Iterable<StringList> set, java.lang.Double k)
ngram - the ngram to get the probability for
set - the vocabulary
k - the smoothing factor
public static double calculateUnigramMLProbability(java.lang.String word,
java.util.Collection<StringList> set)
word - the only word in the unigram
set - the vocabulary
public static double calculateBigramMLProbability(java.lang.String x0,
java.lang.String x1,
java.util.Collection<StringList> set)
x0 - first word in the bigram
x1 - second word in the bigram
set - the vocabulary
public static double calculateTrigramMLProbability(java.lang.String x0,
java.lang.String x1,
java.lang.String x2,
java.lang.Iterable<StringList> set)
x0 - first word in the trigram
x1 - second word in the trigram
x2 - third word in the trigram
set - the vocabulary
public static double calculateNgramMLProbability(StringList ngram, java.lang.Iterable<StringList> set)
ngram - an ngram
set - the vocabulary
public static double calculateBigramPriorSmoothingProbability(java.lang.String x0,
java.lang.String x1,
java.util.Collection<StringList> set,
java.lang.Double k)
x0 - the first word in the bigram
x1 - the second word in the bigram
set - the vocabulary
k - the smoothing factor
public static double calculateTrigramLinearInterpolationProbability(java.lang.String x0,
java.lang.String x1,
java.lang.String x2,
java.util.Collection<StringList> set,
java.lang.Double lambda1,
java.lang.Double lambda2,
java.lang.Double lambda3)
x0 - the first word in the trigram
x1 - the second word in the trigram
x2 - the third word in the trigram
set - the vocabulary
lambda1 - trigram interpolation factor
lambda2 - bigram interpolation factor
lambda3 - unigram interpolation factor
public static double calculateMissingNgramProbabilityMass(StringList ngram, java.lang.Double discount, java.lang.Iterable<StringList> set)
ngram - the ngram
discount - discount factor
set - the vocabulary
public static StringList getNMinusOneTokenFirst(StringList ngram)
ngram - an ngram
public static StringList getNMinusOneTokenLast(StringList ngram)
ngram - an ngram
public static java.util.Collection<StringList> getNGrams(StringList sequence, int size)
sequence - a sequence of tokens
size - the size of the resulting ngrams
public static java.util.Collection<java.lang.String[]> getNGrams(java.lang.String[] sequence,
int size)
sequence - a sequence of tokens
size - the size of the resulting ngrams
Copyright © 2010 - 2023 Adobe. All Rights Reserved