Tidy up Lexeme and update docs

This commit is contained in:
ines 2017-10-27 21:07:50 +02:00
parent ba5e646219
commit a8e10f94e4
3 changed files with 337 additions and 104 deletions

View File

@ -2,27 +2,17 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function
from libc.math cimport sqrt
from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64
# Compiler crashes on memory view coercion without this. Should report bug. # Compiler crashes on memory view coercion without this. Should report bug.
from cython.view cimport array as cvarray from cython.view cimport array as cvarray
cimport numpy as np cimport numpy as np
np.import_array() np.import_array()
from libc.string cimport memset from libc.string cimport memset
import numpy import numpy
from .typedefs cimport attr_t, flags_t from .typedefs cimport attr_t, flags_t
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from .attrs cimport IS_BRACKET from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_OOV
from .attrs cimport IS_QUOTE
from .attrs cimport IS_LEFT_PUNCT
from .attrs cimport IS_RIGHT_PUNCT
from .attrs cimport IS_OOV
from . import about from . import about
@ -32,8 +22,8 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
cdef class Lexeme: cdef class Lexeme:
"""An entry in the vocabulary. A `Lexeme` has no string context – it's a """An entry in the vocabulary. A `Lexeme` has no string context – it's a
word-type, as opposed to a word token. It therefore has no part-of-speech word-type, as opposed to a word token. It therefore has no part-of-speech
tag, dependency parse, or lemma (lemmatization depends on the part-of-speech tag, dependency parse, or lemma (lemmatization depends on the
tag). part-of-speech tag).
""" """
def __init__(self, Vocab vocab, attr_t orth): def __init__(self, Vocab vocab, attr_t orth):
"""Create a Lexeme object. """Create a Lexeme object.
@ -60,17 +50,17 @@ cdef class Lexeme:
else: else:
a = 0 a = 0
b = 1 b = 1
if op == 2: # == if op == 2: # ==
return a == b return a == b
elif op == 3: # != elif op == 3: # !=
return a != b return a != b
elif op == 0: # < elif op == 0: # <
return a < b return a < b
elif op == 1: # <= elif op == 1: # <=
return a <= b return a <= b
elif op == 4: # > elif op == 4: # >
return a > b return a > b
elif op == 5: # >= elif op == 5: # >=
return a >= b return a >= b
else: else:
raise NotImplementedError(op) raise NotImplementedError(op)
@ -104,7 +94,8 @@ cdef class Lexeme:
""" """
if self.vector_norm == 0 or other.vector_norm == 0: if self.vector_norm == 0 or other.vector_norm == 0:
return 0.0 return 0.0
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) return (numpy.dot(self.vector, other.vector) /
(self.vector_norm * other.vector_norm))
def to_bytes(self): def to_bytes(self):
lex_data = Lexeme.c_to_bytes(self.c) lex_data = Lexeme.c_to_bytes(self.c)
@ -130,19 +121,13 @@ cdef class Lexeme:
self.orth = self.c.orth self.orth = self.c.orth
property has_vector: property has_vector:
"""A boolean value indicating whether a word vector is associated with """RETURNS (bool): Whether a word vector is associated with the object.
the object.
RETURNS (bool): Whether a word vector is associated with the object.
""" """
def __get__(self): def __get__(self):
return self.vocab.has_vector(self.c.orth) return self.vocab.has_vector(self.c.orth)
property vector_norm: property vector_norm:
"""The L2 norm of the lexeme's vector representation. """RETURNS (float): The L2 norm of the vector representation."""
RETURNS (float): The L2 norm of the vector representation.
"""
def __get__(self): def __get__(self):
vector = self.vector vector = self.vector
return numpy.sqrt((vector**2).sum()) return numpy.sqrt((vector**2).sum())
@ -169,149 +154,320 @@ cdef class Lexeme:
self.vocab.set_vector(self.c.orth, vector) self.vocab.set_vector(self.c.orth, vector)
property rank: property rank:
"""RETURNS (int): Sequential ID of the lexeme's lexical type, used
to index into tables, e.g. for word vectors."""
def __get__(self): def __get__(self):
return self.c.id return self.c.id
def __set__(self, value): def __set__(self, value):
self.c.id = value self.c.id = value
property sentiment: property sentiment:
"""RETURNS (float): A scalar value indicating the positivity or
negativity of the lexeme."""
def __get__(self): def __get__(self):
return self.c.sentiment return self.c.sentiment
def __set__(self, float sentiment): def __set__(self, float sentiment):
self.c.sentiment = sentiment self.c.sentiment = sentiment
property orth_: property orth_:
"""RETURNS (unicode): The original verbatim text of the lexeme
(identical to `Lexeme.text`). Exists mostly for consistency with
the other attributes."""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.orth] return self.vocab.strings[self.c.orth]
property text: property text:
"""A unicode representation of the token text. """RETURNS (unicode): The original verbatim text of the lexeme."""
RETURNS (unicode): The original verbatim text of the token.
"""
def __get__(self): def __get__(self):
return self.orth_ return self.orth_
property lower: property lower:
def __get__(self): return self.c.lower """RETURNS (unicode): Lowercase form of the lexeme."""
def __set__(self, attr_t x): self.c.lower = x def __get__(self):
return self.c.lower
def __set__(self, attr_t x):
self.c.lower = x
property norm: property norm:
def __get__(self): return self.c.norm """RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the
def __set__(self, attr_t x): self.c.norm = x lexeme text.
"""
def __get__(self):
return self.c.norm
def __set__(self, attr_t x):
self.c.norm = x
property shape: property shape:
def __get__(self): return self.c.shape """RETURNS (uint64): Transform of the word's string, to show
def __set__(self, attr_t x): self.c.shape = x orthographic features.
"""
def __get__(self):
return self.c.shape
def __set__(self, attr_t x):
self.c.shape = x
property prefix: property prefix:
def __get__(self): return self.c.prefix """RETURNS (uint64): Length-N substring from the start of the word.
def __set__(self, attr_t x): self.c.prefix = x Defaults to `N=1`.
"""
def __get__(self):
return self.c.prefix
def __set__(self, attr_t x):
self.c.prefix = x
property suffix: property suffix:
def __get__(self): return self.c.suffix """RETURNS (uint64): Length-N substring from the end of the word.
def __set__(self, attr_t x): self.c.suffix = x Defaults to `N=3`.
"""
def __get__(self):
return self.c.suffix
def __set__(self, attr_t x):
self.c.suffix = x
property cluster: property cluster:
def __get__(self): return self.c.cluster """RETURNS (int): Brown cluster ID."""
def __set__(self, attr_t x): self.c.cluster = x def __get__(self):
return self.c.cluster
def __set__(self, attr_t x):
self.c.cluster = x
property lang: property lang:
def __get__(self): return self.c.lang """RETURNS (uint64): Language of the parent vocabulary."""
def __set__(self, attr_t x): self.c.lang = x def __get__(self):
return self.c.lang
def __set__(self, attr_t x):
self.c.lang = x
property prob: property prob:
def __get__(self): return self.c.prob """RETURNS (float): Smoothed log probability estimate of the lexeme's
def __set__(self, float x): self.c.prob = x type."""
def __get__(self):
return self.c.prob
def __set__(self, float x):
self.c.prob = x
property lower_: property lower_:
def __get__(self): return self.vocab.strings[self.c.lower] """RETURNS (unicode): Lowercase form of the word."""
def __set__(self, unicode x): self.c.lower = self.vocab.strings.add(x) def __get__(self):
return self.vocab.strings[self.c.lower]
def __set__(self, unicode x):
self.c.lower = self.vocab.strings.add(x)
property norm_: property norm_:
def __get__(self): return self.vocab.strings[self.c.norm] """RETURNS (unicode): The lexeme's norm, i.e. a normalised form of the
def __set__(self, unicode x): self.c.norm = self.vocab.strings.add(x) lexeme text.
"""
def __get__(self):
return self.vocab.strings[self.c.norm]
def __set__(self, unicode x):
self.c.norm = self.vocab.strings.add(x)
property shape_: property shape_:
def __get__(self): return self.vocab.strings[self.c.shape] """RETURNS (unicode): Transform of the word's string, to show
def __set__(self, unicode x): self.c.shape = self.vocab.strings.add(x) orthographic features.
"""
def __get__(self):
return self.vocab.strings[self.c.shape]
def __set__(self, unicode x):
self.c.shape = self.vocab.strings.add(x)
property prefix_: property prefix_:
def __get__(self): return self.vocab.strings[self.c.prefix] """RETURNS (unicode): Length-N substring from the start of the word.
def __set__(self, unicode x): self.c.prefix = self.vocab.strings.add(x) Defaults to `N=1`.
"""
def __get__(self):
return self.vocab.strings[self.c.prefix]
def __set__(self, unicode x):
self.c.prefix = self.vocab.strings.add(x)
property suffix_: property suffix_:
def __get__(self): return self.vocab.strings[self.c.suffix] """RETURNS (unicode): Length-N substring from the end of the word.
def __set__(self, unicode x): self.c.suffix = self.vocab.strings.add(x) Defaults to `N=3`.
"""
def __get__(self):
return self.vocab.strings[self.c.suffix]
def __set__(self, unicode x):
self.c.suffix = self.vocab.strings.add(x)
property lang_: property lang_:
def __get__(self): return self.vocab.strings[self.c.lang] """RETURNS (unicode): Language of the parent vocabulary."""
def __set__(self, unicode x): self.c.lang = self.vocab.strings.add(x) def __get__(self):
return self.vocab.strings[self.c.lang]
def __set__(self, unicode x):
self.c.lang = self.vocab.strings.add(x)
property flags: property flags:
def __get__(self): return self.c.flags """RETURNS (uint64): Container of the lexeme's binary flags."""
def __set__(self, flags_t x): self.c.flags = x def __get__(self):
return self.c.flags
def __set__(self, flags_t x):
self.c.flags = x
property is_oov: property is_oov:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_OOV) """RETURNS (bool): Whether the lexeme is out-of-vocabulary."""
def __set__(self, attr_t x): Lexeme.c_set_flag(self.c, IS_OOV, x) def __get__(self):
return Lexeme.c_check_flag(self.c, IS_OOV)
def __set__(self, attr_t x):
Lexeme.c_set_flag(self.c, IS_OOV, x)
property is_stop: property is_stop:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_STOP) """RETURNS (bool): Whether the lexeme is a stop word."""
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_STOP, x) def __get__(self):
return Lexeme.c_check_flag(self.c, IS_STOP)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_STOP, x)
property is_alpha: property is_alpha:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_ALPHA) """RETURNS (bool): Whether the lexeme consists of alphanumeric
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_ALPHA, x) characters. Equivalent to `lexeme.text.isalpha()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_ALPHA)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_ALPHA, x)
property is_ascii: property is_ascii:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_ASCII) """RETURNS (bool): Whether the lexeme consists of ASCII characters.
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_ASCII, x) Equivalent to `all(ord(c) < 128 for c in lexeme.text)`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_ASCII)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_ASCII, x)
property is_digit: property is_digit:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_DIGIT) """RETURNS (bool): Whether the lexeme consists of digits. Equivalent
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_DIGIT, x) to `lexeme.text.isdigit()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_DIGIT)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_DIGIT, x)
property is_lower: property is_lower:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_LOWER) """RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_LOWER, x) `lexeme.text.islower()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_LOWER)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_LOWER, x)
property is_upper:
"""RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to
`lexeme.text.isupper()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_UPPER)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_UPPER, x)
property is_title: property is_title:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_TITLE) """RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_TITLE, x) `lexeme.text.istitle()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_TITLE)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_TITLE, x)
property is_punct: property is_punct:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_PUNCT) """RETURNS (bool): Whether the lexeme is punctuation."""
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_PUNCT, x) def __get__(self):
return Lexeme.c_check_flag(self.c, IS_PUNCT)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_PUNCT, x)
property is_space: property is_space:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_SPACE) """RETURNS (bool): Whether the lexeme consist of whitespace characters.
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_SPACE, x) Equivalent to `lexeme.text.isspace()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_SPACE)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_SPACE, x)
property is_bracket: property is_bracket:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_BRACKET) """RETURNS (bool): Whether the lexeme is a bracket."""
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_BRACKET, x) def __get__(self):
return Lexeme.c_check_flag(self.c, IS_BRACKET)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_BRACKET, x)
property is_quote: property is_quote:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_QUOTE) """RETURNS (bool): Whether the lexeme is a quotation mark."""
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_QUOTE, x) def __get__(self):
return Lexeme.c_check_flag(self.c, IS_QUOTE)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_QUOTE, x)
property is_left_punct: property is_left_punct:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT) """RETURNS (bool): Whether the lexeme is left punctuation, e.g. (."""
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x) def __get__(self):
return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)
property is_right_punct: property is_right_punct:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT) """RETURNS (bool): Whether the lexeme is right punctuation, e.g. )."""
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x) def __get__(self):
return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
property like_url: property like_url:
def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL) """RETURNS (bool): Whether the lexeme resembles a URL."""
def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x) def __get__(self):
return Lexeme.c_check_flag(self.c, LIKE_URL)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, LIKE_URL, x)
property like_num: property like_num:
def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_NUM) """RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9",
def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_NUM, x) "10", "ten", etc.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, LIKE_NUM)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, LIKE_NUM, x)
property like_email: property like_email:
def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_EMAIL) """RETURNS (bool): Whether the lexeme resembles an email address."""
def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_EMAIL, x) def __get__(self):
return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)

View File

@ -157,27 +157,61 @@ p The L2 norm of the lexeme's vector representation.
+row +row
+cell #[code vocab] +cell #[code vocab]
+cell #[code Vocab] +cell #[code Vocab]
+cell +cell The lexeme's vocabulary.
+row +row
+cell #[code text] +cell #[code text]
+cell unicode +cell unicode
+cell Verbatim text content. +cell Verbatim text content.
+row
+cell #[code orth]
+cell int
+cell ID of the verbatim text content.
+row
+cell #[code orth_]
+cell unicode
+cell
| Verbatim text content (identical to #[code Lexeme.text]). Exists
| mostly for consistency with the other attributes.
+row +row
+cell #[code lex_id] +cell #[code lex_id]
+cell int +cell int
+cell ID of the lexeme's lexical type. +cell ID of the lexeme's lexical type.
+row
+cell #[code rank]
+cell int
+cell
| Sequential ID of the lexeme's lexical type, used to index into
| tables, e.g. for word vectors.
+row
+cell #[code flags]
+cell int
+cell Container of the lexeme's binary flags.
+row
+cell #[code norm]
+cell int
+cell The lexeme's norm, i.e. a normalised form of the lexeme text.
+row
+cell #[code norm_]
+cell unicode
+cell The lexeme's norm, i.e. a normalised form of the lexeme text.
+row +row
+cell #[code lower] +cell #[code lower]
+cell int +cell int
+cell Lower-case form of the word. +cell Lowercase form of the word.
+row +row
+cell #[code lower_] +cell #[code lower_]
+cell unicode +cell unicode
+cell Lower-case form of the word. +cell Lowercase form of the word.
+row +row
+cell #[code shape] +cell #[code shape]
@ -192,22 +226,30 @@ p The L2 norm of the lexeme's vector representation.
+row +row
+cell #[code prefix] +cell #[code prefix]
+cell int +cell int
+cell Length-N substring from the start of the word. Defaults to #[code N=1]. +cell
| Length-N substring from the start of the word. Defaults to
| #[code N=1].
+row +row
+cell #[code prefix_] +cell #[code prefix_]
+cell unicode +cell unicode
+cell Length-N substring from the start of the word. Defaults to #[code N=1]. +cell
| Length-N substring from the start of the word. Defaults to
| #[code N=1].
+row +row
+cell #[code suffix] +cell #[code suffix]
+cell int +cell int
+cell Length-N substring from the end of the word. Defaults to #[code N=3]. +cell
| Length-N substring from the end of the word. Defaults to
| #[code N=3].
+row +row
+cell #[code suffix_] +cell #[code suffix_]
+cell unicode +cell unicode
+cell Length-N substring from the start of the word. Defaults to #[code N=3]. +cell
| Length-N substring from the end of the word. Defaults to
| #[code N=3].
+row +row
+cell #[code is_alpha] +cell #[code is_alpha]
@ -237,6 +279,13 @@ p The L2 norm of the lexeme's vector representation.
| Is the lexeme in lowercase? Equivalent to | Is the lexeme in lowercase? Equivalent to
| #[code lexeme.text.islower()]. | #[code lexeme.text.islower()].
+row
+cell #[code is_upper]
+cell bool
+cell
| Is the lexeme in uppercase? Equivalent to
| #[code lexeme.text.isupper()].
+row +row
+cell #[code is_title] +cell #[code is_title]
+cell bool +cell bool
@ -249,6 +298,16 @@ p The L2 norm of the lexeme's vector representation.
+cell bool +cell bool
+cell Is the lexeme punctuation? +cell Is the lexeme punctuation?
+row
+cell #[code is_left_punct]
+cell bool
+cell Is the lexeme a left punctuation mark, e.g. #[code (]?
+row
+cell #[code is_right_punct]
+cell bool
+cell Is the lexeme a right punctuation mark, e.g. #[code )]?
+row +row
+cell #[code is_space] +cell #[code is_space]
+cell bool +cell bool
@ -256,6 +315,16 @@ p The L2 norm of the lexeme's vector representation.
| Does the lexeme consist of whitespace characters? Equivalent to | Does the lexeme consist of whitespace characters? Equivalent to
| #[code lexeme.text.isspace()]. | #[code lexeme.text.isspace()].
+row
+cell #[code is_bracket]
+cell bool
+cell Is the lexeme a bracket?
+row
+cell #[code is_quote]
+cell bool
+cell Is the lexeme a quotation mark?
+row +row
+cell #[code like_url] +cell #[code like_url]
+cell bool +cell bool
@ -285,6 +354,7 @@ p The L2 norm of the lexeme's vector representation.
+cell #[code lang] +cell #[code lang]
+cell int +cell int
+cell Language of the parent vocabulary. +cell Language of the parent vocabulary.
+row +row
+cell #[code lang_] +cell #[code lang_]
+cell unicode +cell unicode
@ -293,9 +363,16 @@ p The L2 norm of the lexeme's vector representation.
+row +row
+cell #[code prob] +cell #[code prob]
+cell float +cell float
+cell Smoothed log probability estimate of lexeme's type. +cell Smoothed log probability estimate of the lexeme's type.
+row
+cell #[code cluster]
+cell int
+cell Brown cluster ID.
+row +row
+cell #[code sentiment] +cell #[code sentiment]
+cell float +cell float
+cell A scalar value indicating the positivity or negativity of the lexeme. +cell
| A scalar value indicating the positivity or negativity of the
| lexeme.

View File

@ -801,7 +801,7 @@ p The L2 norm of the token's vector representation.
+cell int +cell int
+cell +cell
| Sequential ID of the token's lexical type, used to index into | Sequential ID of the token's lexical type, used to index into
| tagles, e.g. for word vectors. | tables, e.g. for word vectors.
+row +row
+cell #[code cluster] +cell #[code cluster]