nlp

Type Members

trait BackoffIndexer[WordType, NGramType] extends NGramIndexer[WordType, NGramType]

A family of NGramIndexer that can unpack or strip off specific words, query the order of a packed ngram, etc.
case class CoreNLPFeatureExtractor(orders: Seq[Int]) extends Transformer[String, Seq[String]] with Product with Serializable

Transformer that uses CoreNLP to (in order): - Tokenize document - Lemmatize tokens - Replace entities w/ their type (summary truncated here; see the class's full documentation for the remaining details).
case class HashingTF[T <: Seq[Any]](numFeatures: Int) extends Transformer[T, SparseVector[Double]] with Product with Serializable

Converts a sequence of terms to a sparse vector representing their frequencies, using the hashing trick: https://en.wikipedia.org/wiki/Feature_hashing
class InitialBigramPartitioner[WordType] extends Partitioner

Partitions each ngram by hashing on its first two words (first as in farthest away from the current word), then taking that hash modulo numPartitions.
case class LowerCase(locale: Locale = java.util.Locale.getDefault()) extends Transformer[String, String] with Product with Serializable

Transformer that converts a String to lower case.
class NGram[T] extends Serializable

An NGram representation that is a thin wrapper over Array[String].
trait NGramIndexer[WordType, NGramType] extends Serializable
class NGramIndexerImpl[T] extends BackoffIndexer[T, NGram[T]]
case class NGramsCounts[T](mode: NGramsCountsMode.Value = NGramsCountsMode.Default)(implicit evidence$3: ClassTag[T]) extends FunctionNode[RDD[Seq[Seq[T]]], RDD[(NGram[T], Int)]] with Product with Serializable

A simple transformer that represents each ngram as an NGram and counts their occurrence.
case class NGramsFeaturizer[T](orders: Seq[Int])(implicit evidence$1: ClassTag[T]) extends Transformer[Seq[T], Seq[Seq[T]]] with Product with Serializable

An ngram featurizer.
case class NGramsHashingTF(orders: Seq[Int], numFeatures: Int) extends Transformer[Seq[String], SparseVector[Double]] with Product with Serializable

Converts the n-grams of a sequence of terms to a sparse vector representing their frequencies, using the hashing trick: https://en.wikipedia.org/wiki/Feature_hashing
case class StupidBackoffEstimator[T](unigramCounts: Map[T, Int], alpha: Double = 0.4)(implicit evidence$3: ClassTag[T]) extends Estimator[(NGram[T], Int), (NGram[T], Double)] with Product with Serializable

Estimates a Stupid Backoff ngram language model, which was introduced in the following paper: Brants, Thorsten, et al. "Large Language Models in Machine Translation." EMNLP-CoNLL 2007.
class StupidBackoffModel[T] extends Transformer[(NGram[T], Int), (NGram[T], Double)]
case class Tokenizer(sep: String = "[\\p{Punct}\\s]+") extends Transformer[String, Seq[String]] with Product with Serializable

Transformer that tokenizes a String into a Seq[String] by splitting on a regular expression.
class WordFrequencyTransformer extends Transformer[Seq[String], Seq[Int]]

Encodes string tokens as non-negative integers, which are indices of the tokens' positions in the sorted-by-frequency order.

Value Members

object NGramsCountsMode extends Enumeration

Control flags used for NGramsCounts.
object NaiveBitPackIndexer extends BackoffIndexer[Int, Long]

Packs up to 3 words (trigrams) into a single Long by bit packing.
object Trim extends Transformer[String, String]

Transformer that trims a String of leading and trailing whitespace.
object WordFrequencyEncoder extends Estimator[Seq[String], Seq[Int]]

package nlp

Type Members

trait BackoffIndexer[WordType, NGramType] extends NGramIndexer[WordType, NGramType]

case class CoreNLPFeatureExtractor(orders: Seq[Int]) extends Transformer[String, Seq[String]] with Product with Serializable

case class HashingTF[T <: Seq[Any]](numFeatures: Int) extends Transformer[T, SparseVector[Double]] with Product with Serializable

class InitialBigramPartitioner[WordType] extends Partitioner

case class LowerCase(locale: Locale = java.util.Locale.getDefault()) extends Transformer[String, String] with Product with Serializable

class NGram[T] extends Serializable

trait NGramIndexer[WordType, NGramType] extends Serializable

class NGramIndexerImpl[T] extends BackoffIndexer[T, NGram[T]]

case class NGramsCounts[T](mode: NGramsCountsMode.Value = NGramsCountsMode.Default)(implicit evidence$3: ClassTag[T]) extends FunctionNode[RDD[Seq[Seq[T]]], RDD[(NGram[T], Int)]] with Product with Serializable

case class NGramsFeaturizer[T](orders: Seq[Int])(implicit evidence$1: ClassTag[T]) extends Transformer[Seq[T], Seq[Seq[T]]] with Product with Serializable

case class NGramsHashingTF(orders: Seq[Int], numFeatures: Int) extends Transformer[Seq[String], SparseVector[Double]] with Product with Serializable

case class StupidBackoffEstimator[T](unigramCounts: Map[T, Int], alpha: Double = 0.4)(implicit evidence$3: ClassTag[T]) extends Estimator[(NGram[T], Int), (NGram[T], Double)] with Product with Serializable

class StupidBackoffModel[T] extends Transformer[(NGram[T], Int), (NGram[T], Double)]

case class Tokenizer(sep: String = "[\\p{Punct}\\s]+") extends Transformer[String, Seq[String]] with Product with Serializable

class WordFrequencyTransformer extends Transformer[Seq[String], Seq[Int]]

Value Members

object NGramsCountsMode extends Enumeration

object NaiveBitPackIndexer extends BackoffIndexer[Int, Long]

object Trim extends Transformer[String, String]

object WordFrequencyEncoder extends Estimator[Seq[String], Seq[Int]]

Ungrouped