//- ----------------------------------
//- 💫 DOCS > API > LEXEME
//- ----------------------------------

+section("lexeme")
    +h(2, "lexeme", "https://github.com/" + SOCIAL.github + "/spaCy/blob/master/spacy/lexeme.pyx")
        | #[+tag class] Lexeme

    p.
        The Lexeme object represents a lexical type, stored in the vocabulary –
        as opposed to a token, occurring in a document.

    p.
        Each Token object receives a reference to a lexeme object (specifically,
        it receives a pointer to a #[code LexemeC] struct). This allows features
        to be computed and saved once per type, rather than once per token. As
        job sizes grow, this amounts to substantial efficiency improvements, as
        the vocabulary size (number of types) will be much smaller than the total
        number of words processed (number of tokens).
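
    p.
        For example, the number of distinct lexical types in a document is
        typically much smaller than the number of tokens, and per-type storage
        scales with the former. A minimal sketch, assuming an English model is
        installed and loadable via #[code spacy.load('en')]:

    +code("python", "Types vs. tokens").
        import spacy

        nlp = spacy.load('en')                      # assumes an installed English model
        doc = nlp(u'The cat sat on the mat. The cat was pleased.')

        n_tokens = len(doc)                         # tokens: every word occurrence
        n_types = len(set(w.orth for w in doc))     # types: distinct vocabulary entries

        assert n_types < n_tokens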

    p.
        All Lexeme attributes are therefore context independent, as a single lexeme
        is reused for all usages of that word. Lexemes are keyed by the #[code orth]
        attribute.
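
    p.
        For instance, the lexeme for a string can be fetched from the vocabulary,
        and its #[code orth] value is the integer ID of that string. A minimal
        sketch, assuming an English model is installed:

    +code("python", "Keyed by orth").
        import spacy

        nlp = spacy.load('en')                      # assumes an installed English model
        apple = nlp.vocab[u'apple']

        # The lexeme is keyed by orth, the integer ID of the original string.
        assert apple.orth == nlp.vocab.strings[u'apple']
        assert apple.orth_ == u'apple'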

    p.
        Most Lexeme attributes can be set, with the exception of the primary key,
        #[code orth]. Assigning to an attribute of the #[code Lexeme] object writes
        to the underlying struct, so all tokens that are backed by that
        #[code Lexeme] will inherit the new value.
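
    p.
        For example, writing to a lexeme attribute changes the value seen by
        every token of that type. A minimal sketch, assuming an English model is
        installed (the choice of #[code is_stop] is just illustrative):

    +code("python", "Setting lexeme attributes").
        import spacy

        nlp = spacy.load('en')                      # assumes an installed English model
        doc = nlp(u'I like coffee. Coffee likes me.')

        # Assigning to the lexeme writes to the shared underlying struct...
        nlp.vocab[u'coffee'].is_stop = True

        # ...so every token backed by that lexeme sees the new value.
        assert doc[2].is_stop                       # the lower-case type u'coffee'

        # Note that u'Coffee' is a different type (different orth), so it is
        # unaffected.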

    +code("python", "Overview").
        class Lexeme:
            def __init__(self, vocab, key):
                return self

            int rank

            int orth, lower, shape, prefix, suffix
            unicode orth_, lower_, shape_, prefix_, suffix_

            bool is_alpha, is_ascii, is_lower, is_title, is_punct, is_space, like_url, like_num, like_email, is_oov, is_stop

            float prob
            int cluster
            numpy.ndarray[float64] vector
            bool has_vector

            def set_flag(self, flag_id, value):
                return None

            def check_flag(self, flag_id):
                return bool

            def similarity(self, other):
                return float

    +table(["Example", "Description"])
        +row
            +cell #[code.lang-python lexeme = nlp.vocab[string]]
            +cell Lookup by string

        +row
            +cell #[code.lang-python lexeme = vocab[i]]
            +cell Lookup by integer
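
    p.
        Both lookups refer to the same underlying lexeme: looking up by string
        and looking up by the corresponding integer ID give equivalent results.
        A minimal sketch, assuming an English model is installed:

    +code("python", "Lookup").
        import spacy

        nlp = spacy.load('en')                      # assumes an installed English model

        by_string = nlp.vocab[u'apple']             # lookup by string
        by_id = nlp.vocab[by_string.orth]           # lookup by integer ID

        assert by_string.orth == by_id.orth
        assert by_string.orth_ == by_id.orth_ == u'apple'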

    +section("lexeme-stringfeatures")
        +h(3, "lexeme-stringfeatures").
            String Features

        +table(["Name", "Description"])
            +row
                +cell orth / orth_
                +cell.
                    The form of the word with no string normalization or processing,
                    as it appears in the string, without trailing whitespace.

            +row
                +cell lower / lower_
                +cell.
                    The form of the word, but forced to lower-case, i.e.
                    #[code lower = word.orth_.lower()]

            +row
                +cell shape / shape_
                +cell.
                    A transform of the word's string, to show orthographic features.
                    The characters a-z are mapped to x, A-Z is mapped to X, and 0-9
                    is mapped to d. After these mappings, sequences of 4 or more of
                    the same character are truncated to length 4. Examples:
                    C3Po --> XdXx, favorite --> xxxx, :) --> :)

            +row
                +cell prefix / prefix_
                +cell.
                    A length-N substring from the start of the word. Length may
                    vary by language; currently for English n=1, i.e.
                    #[code prefix = word.orth_[:1]]

            +row
                +cell suffix / suffix_
                +cell.
                    A length-N substring from the end of the word. Length may vary
                    by language; currently for English n=3, i.e.
                    #[code suffix = word.orth_[-3:]]
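
        p.
            A minimal sketch of the string features, assuming an English model
            is installed (the expected values follow the rules above):

        +code("python", "String features").
            import spacy

            nlp = spacy.load('en')                  # assumes an installed English model
            apple = nlp.vocab[u'Apple']

            assert apple.orth_ == u'Apple'          # the unnormalized string
            assert apple.lower_ == u'apple'         # forced to lower-case
            assert apple.shape_ == u'Xxxxx'         # A-Z -> X, a-z -> x
            assert apple.prefix_ == u'A'            # n=1 for English
            assert apple.suffix_ == u'ple'          # n=3 for English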

    +section("lexeme-booleanflags")
        +h(3, "lexeme-booleanflags")
            | Boolean Flags

        +table(["Name", "Description"])
            +row
                +cell is_alpha
                +cell Equivalent to #[code word.orth_.isalpha()]

            +row
                +cell is_ascii
                +cell Equivalent to #[code all(ord(c) < 128 for c in word.orth_)]

            +row
                +cell is_digit
                +cell Equivalent to #[code word.orth_.isdigit()]

            +row
                +cell is_lower
                +cell Equivalent to #[code word.orth_.islower()]

            +row
                +cell is_title
                +cell Equivalent to #[code word.orth_.istitle()]

            +row
                +cell is_punct
                +cell Is the word punctuation?

            +row
                +cell is_space
                +cell Equivalent to #[code word.orth_.isspace()]

            +row
                +cell like_url
                +cell Does the word resemble a URL?

            +row
                +cell like_num
                +cell Does the word represent a number? e.g. “10.9”, “10”, “ten”, etc.

            +row
                +cell like_email
                +cell Does the word resemble an email address?

            +row
                +cell is_oov
                +cell Is the word out-of-vocabulary?

            +row
                +cell is_stop
                +cell.
                    Is the word part of a "stop list"? Stop lists are used to
                    improve the quality of topic models, by filtering out common,
                    domain-general words.
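
        p.
            The flags are available both as attributes and through the
            #[code check_flag] method, which takes a flag ID such as the
            constants defined in #[code spacy.attrs]. A minimal sketch, assuming
            an English model is installed:

        +code("python", "Boolean flags").
            import spacy
            from spacy.attrs import IS_PUNCT, LIKE_NUM

            nlp = spacy.load('en')                  # assumes an installed English model

            assert nlp.vocab[u'apple'].is_alpha     # alphabetic characters only
            assert not nlp.vocab[u'apple'].like_num
            assert nlp.vocab[u'ten'].like_num       # number words count as numbers
            assert nlp.vocab[u'!'].check_flag(IS_PUNCT)
            assert nlp.vocab[u'10.9'].check_flag(LIKE_NUM)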

    +section("lexeme-distributional")
        +h(3, "lexeme-distributional")
            | Distributional Features

        +table(["Name", "Description"])
            +row
                +cell prob
                +cell.
                    The unigram log-probability of the word, estimated from
                    counts from a large corpus, smoothed using Simple Good-Turing
                    estimation.

            +row
                +cell cluster
                +cell.
                    The Brown cluster ID of the word. These are often useful features
                    for linear models. If you're using a non-linear model, particularly
                    a neural net or random forest, consider using the real-valued
                    word representation vector, in #[code vector], instead.

            +row
                +cell vector
                +cell.
                    A "word embedding" representation: a dense real-valued vector
                    that supports similarity queries between words. By default,
                    spaCy currently loads vectors produced by the Levy and
                    Goldberg (2014) dependency-based word2vec model.

            +row
                +cell has_vector
                +cell.
                    A boolean value indicating whether a word vector is associated
                    with the word.
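
        p.
            A minimal sketch of the distributional features, assuming an English
            model with word vectors is installed:

        +code("python", "Distributional features").
            import spacy

            nlp = spacy.load('en')                  # assumes an installed English model
            apple = nlp.vocab[u'apple']
            orange = nlp.vocab[u'orange']

            print(apple.prob)                       # unigram log-probability (negative)
            print(apple.cluster)                    # Brown cluster ID

            if apple.has_vector and orange.has_vector:
                # Similarity (cosine) between the two word vectors.
                print(apple.similarity(orange))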