mirror of https://github.com/explosion/spaCy.git
* Upd docstrings

parent 91a5064b7f
commit bb80937544
@@ -16,16 +16,64 @@ def get_lex_props(string):
 class English(object):
-    def __init__(self, data_dir=None, tag=True, parse=False):
+    """The English NLP pipeline.
+
+    Provides a tokenizer, lexicon, part-of-speech tagger and parser.
+
+    Keyword args:
+        data_dir (unicode): A path to a directory, from which to load the pipeline.
+            If None, looks for a directory named "data/" in the same directory as
+            the present file, i.e. path.join(path.dirname(__file__, 'data')).
+
+            If path.join(data_dir, 'pos') exists, the tagger is loaded from it.
+            If path.join(data_dir, 'deps') exists, the parser is loaded from it.
+            See Pipeline Directory Structure for details.
+
+    Attributes:
+        vocab (spacy.vocab.Vocab): The lexicon.
+
+        strings (spacy.strings.StringStore): Encode/decode strings to/from integer IDs.
+
+        tokenizer (spacy.tokenizer.Tokenizer): The start of the pipeline.
+
+        tagger (spacy.en.pos.EnPosTagger):
+            The part-of-speech tagger, which also performs lemmatization and
+            morphological analysis.
+
+        parser (spacy.syntax.parser.GreedyParser):
+            A greedy shift-reduce dependency parser.
+    """
+    def __init__(self, data_dir=None):
         if data_dir is None:
             data_dir = path.join(path.dirname(__file__), 'data')
         self.vocab = Vocab(data_dir=data_dir, get_lex_props=get_lex_props)
         self.tokenizer = Tokenizer.from_dir(self.vocab, data_dir)
-        self.tagger = EnPosTagger(self.vocab.strings, data_dir) if tag else None
-        self.parser = GreedyParser(path.join(data_dir, 'deps')) if parse else None
+        if path.exists(path.join(data_dir, 'pos')):
+            self.tagger = EnPosTagger(self.vocab.strings, data_dir)
+        else:
+            self.tagger = None
+        if path.exists(path.join(data_dir, 'deps')):
+            self.parser = GreedyParser(path.join(data_dir, 'deps'))
+        else:
+            self.parser = None
         self.strings = self.vocab.strings

     def __call__(self, text, tag=True, parse=True):
+        """Apply the pipeline to some text.
+
+        Args:
+            text (unicode): The text to be processed.
+
+        Keyword args:
+            tag (bool): Whether to add part-of-speech tags to the text. This
+                will also set morphological analysis and lemmas.
+
+            parse (bool): Whether to add dependency-heads and labels to the text.
+
+        Returns:
+            tokens (spacy.tokens.Tokens):
+        """
         tokens = self.tokenizer.tokenize(text)
         if self.tagger and tag:
             self.tagger(tokens)
@@ -35,7 +83,10 @@ class English(object):
     @property
     def tags(self):
+        """List of part-of-speech tag names."""
         if self.tagger is None:
             return []
         else:
             return self.tagger.tag_names
+
+
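Taken together, the new docstrings describe the following workflow. A minimal usage sketch (assuming the default data/ directory is installed next to spacy/en/__init__.py, with pos/ and deps/ models present):

    from spacy.en import English

    nlp = English()                               # loads data/, plus pos/ and deps/ if present
    tokens = nlp(u'This is a sentence.', tag=True, parse=False)
    for token in tokens:
        print(token.string, token.pos, token.lemma)
    print(nlp.tags)                               # tag names, or [] if no tagger was loaded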
@@ -204,6 +204,7 @@ cdef struct _CachedMorph:
 cdef class EnPosTagger(Tagger):
+    """A part-of-speech tagger for English"""
     def __init__(self, StringStore strings, data_dir):
         model_dir = path.join(data_dir, 'pos')
         Tagger.__init__(self, path.join(model_dir))

@@ -224,6 +225,11 @@ cdef class EnPosTagger(Tagger):
         self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)

     def __call__(self, Tokens tokens):
+        """Apply the tagger, setting the POS tags onto the Tokens object.
+
+        Args:
+            tokens (Tokens): The tokens to be tagged.
+        """
         cdef int i
         cdef atom_t[N_CONTEXT_FIELDS] context
         cdef TokenC* t = tokens.data
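A hedged sketch of how the tagger is used on its own, mirroring what English.__init__ and __call__ do above (the data_dir layout and the vocab/tokens objects are assumed to exist):

    if path.exists(path.join(data_dir, 'pos')):
        tagger = EnPosTagger(vocab.strings, data_dir)   # model is read from data_dir/pos
        tagger(tokens)                                  # sets tags, lemmas and morphology in place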
@@ -13,7 +13,8 @@ from itertools import combinations
 from ..tokens cimport TokenC
 from ._state cimport State
 from ._state cimport get_s2, get_s1, get_s0, get_n0, get_n1, get_n2
-from ._state cimport get_left, get_right
+from ._state cimport has_head, get_left, get_right
+from ._state cimport count_left_kids, count_right_kids


 cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:

@@ -24,10 +25,12 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
         context[3] = 0
         context[4] = 0
         context[5] = 0
+        context[6] = 0
     else:
         context[0] = token.lex.sic
-        context[1] = token.pos
-        context[2] = token.lex.cluster
+        context[1] = token.lemma
+        context[2] = token.fine_pos
+        context[3] = token.lex.cluster
         # We've read in the string little-endian, so now we can take & (2**n)-1
         # to get the first n bits of the cluster.
         # e.g. s = "1110010101"

@@ -40,9 +43,9 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
         # What we're doing here is picking a number where all bits are 1, e.g.
         # 15 is 1111, 63 is 111111 and doing bitwise AND, so getting all bits in
         # the source that are set to 1.
-        context[3] = token.lex.cluster & 63
-        context[4] = token.lex.cluster & 15
-        context[5] = token.dep_tag
+        context[4] = token.lex.cluster & 63
+        context[5] = token.lex.cluster & 15
+        context[6] = token.dep_tag if has_head(token) else 0


 cdef int fill_context(atom_t* context, State* state) except -1:
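A worked example of the little-endian trick described in the comments above, as a standalone Python sketch (not part of the commit):

    s = "1110010101"                    # Brown cluster path, as a bit string
    cluster = int(s[::-1], 2)           # read little-endian: s[0] becomes bit 0
    first_six = cluster & (2**6 - 1)    # keeps the first 6 characters of the path
    assert first_six == int(s[:6][::-1], 2)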
@@ -66,11 +69,147 @@ cdef int fill_context(atom_t* context, State* state) except -1:
         context[dist] = state.stack[0] - state.i
     else:
         context[dist] = 0
-    context[N0lv] = 0
-    context[S0lv] = 0
-    context[S0rv] = 0
-    context[S1lv] = 0
-    context[S1rv] = 0
+    context[N0lv] = max(count_left_kids(get_n0(state)), 5)
+    context[S0lv] = max(count_left_kids(get_s0(state)), 5)
+    context[S0rv] = max(count_right_kids(get_s0(state)), 5)
+    context[S1lv] = max(count_left_kids(get_s1(state)), 5)
+    context[S1rv] = max(count_right_kids(get_s1(state)), 5)
+
+    context[S0_has_head] = 0
+    context[S1_has_head] = 0
+    context[S2_has_head] = 0
+    if state.stack_len >= 1:
+        context[S0_has_head] = has_head(get_s0(state)) + 1
+    if state.stack_len >= 2:
+        context[S1_has_head] = has_head(get_s1(state)) + 1
+    if state.stack_len >= 3:
+        context[S2_has_head] = has_head(get_s2(state))
+
+
+unigrams = (
+    (S2W, S2p),
+    (S2c6, S2p),
+
+    (S1W, S1p),
+    (S1c6, S1p),
+
+    (S0W, S0p),
+    (S0c6, S0p),
+
+    (N0W, N0p),
+    (N0p,),
+    (N0c,),
+    (N0c6, N0p),
+    (N0L,),
+
+    (N1W, N1p),
+    (N1c6, N1p),
+
+    (N2W, N2p),
+    (N2c6, N2p),
+
+    (S0r2W, S0r2p),
+    (S0r2c6, S0r2p),
+    (S0r2L,),
+
+    (S0rW, S0rp),
+    (S0rc6, S0rp),
+    (S0rL,),
+
+    (S0l2W, S0l2p),
+    (S0l2c6, S0l2p),
+    (S0l2L,),
+
+    (S0lW, S0lp),
+    (S0lc6, S0lp),
+    (S0lL,),
+
+    (N0l2W, N0l2p),
+    (N0l2c6, N0l2p),
+    (N0l2L,),
+
+    (N0lW, N0lp),
+    (N0lc6, N0lp),
+    (N0lL,),
+)
+
+
+s0_n0 = (
+    (S0W, S0p, N0W, N0p),
+    (S0c, S0p, N0c, N0p),
+    (S0c6, S0p, N0c6, N0p),
+    (S0c4, S0p, N0c4, N0p),
+    (S0p, N0p),
+    (S0W, N0p),
+    (S0p, N0W),
+    (S0W, N0c),
+    (S0c, N0W),
+    (S0p, N0c),
+    (S0c, N0p),
+    (S0W, S0rp, N0p),
+    (S0p, S0rp, N0p),
+    (S0p, N0lp, N0W),
+    (S0p, N0lp, N0p),
+)
+
+
+s1_n0 = (
+    (S1p, N0p),
+    (S1c, N0c),
+    (S1c, N0p),
+    (S1p, N0c),
+    (S1W, S1p, N0p),
+    (S1p, N0W, N0p),
+    (S1c6, S1p, N0c6, N0p),
+)
+
+
+s0_n1 = (
+    (S0p, N1p),
+    (S0c, N1c),
+    (S0c, N1p),
+    (S0p, N1c),
+    (S0W, S0p, N1p),
+    (S0p, N1W, N1p),
+    (S0c6, S0p, N1c6, N1p),
+)
+
+n0_n1 = (
+    (N0W, N0p, N1W, N1p),
+    (N0W, N0p, N1p),
+    (N0p, N1W, N1p),
+    (N0c, N0p, N1c, N1p),
+    (N0c6, N0p, N1c6, N1p),
+    (N0c, N1c),
+    (N0p, N1c),
+)
+
+tree_shape = (
+    (dist,),
+    (S0p, S0_has_head, S1_has_head, S2_has_head),
+    (S0p, S0lv, S0rv),
+    (N0p, N0lv),
+)
+
+trigrams = (
+    (N0p, N1p, N2p),
+    (S0p, S0lp, S0l2p),
+    (S0p, S0rp, S0r2p),
+    (S0p, S1p, S2p),
+    (S1p, S0p, N0p),
+    (S0p, S0lp, N0p),
+    (S0p, N0p, N0lp),
+    (N0p, N0lp, N0l2p),
+
+    (S0W, S0p, S0rL, S0r2L),
+    (S0p, S0rL, S0r2L),
+
+    (S0W, S0p, S0lL, S0l2L),
+    (S0p, S0lL, S0l2L),
+
+    (N0W, N0p, N0lL, N0l2L),
+    (N0p, N0lL, N0l2L),
+)
+
+
 arc_eager = (
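The template groups above are tuples of context-array indices. Roughly, a template is applied by reading the corresponding slots out of the filled context; a simplified Python sketch of the idea (the real extraction lives in the Cython feature machinery, which is not part of this hunk):

    def apply_template(template, context):
        # e.g. (S0p, N0p) -> (context[S0p], context[N0p])
        return tuple(context[i] for i in template)

    features = [apply_template(t, context) for t in unigrams + s0_n0 + s1_n0]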
@@ -86,7 +225,6 @@ arc_eager = (
     (N2w, N2p),
     (N2w,),
     (N2p,),
-
     (S0w, S0p, N0w, N0p),
     (S0w, S0p, N0w),
     (S0w, N0w, N0p),
@@ -1,4 +1,5 @@
 # cython: profile=True
+
 from preshed.maps cimport PreshMap
 from preshed.counter cimport PreshCounter

@@ -59,13 +60,7 @@ cdef attr_t get_lex_attr(const Lexeme* lex, attr_id_t feat_name) nogil:
 cdef class Tokens:
-    """A sequence of references to Lexeme objects.
-
-    The Tokens class provides fast and memory-efficient access to lexical features,
-    and can efficiently export the data to a numpy array.
-
-    >>> from spacy.en import EN
-    >>> tokens = EN.tokenize('An example sentence.')
+    """Access and set annotations onto some text.
     """
     def __init__(self, Vocab vocab, string_length=0):
         self.vocab = vocab

@@ -86,10 +81,20 @@ cdef class Tokens:
         self.length = 0

     def __getitem__(self, i):
+        """Retrieve a token.
+
+        Returns:
+            token (Token):
+        """
         bounds_check(i, self.length, PADDING)
         return Token(self, i)

     def __iter__(self):
+        """Iterate over the tokens.
+
+        Yields:
+            token (Token):
+        """
         for i in range(self.length):
             yield self[i]
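A short sketch of the access pattern these docstrings promise, continuing the pipeline example from earlier (nlp assumed to be a loaded English instance):

    tokens = nlp(u'An example sentence.')
    first = tokens[0]                        # __getitem__ returns a Token
    words = [t.string for t in tokens]       # __iter__ yields Token objects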
@@ -148,6 +153,11 @@ cdef class Tokens:
 @cython.freelist(64)
 cdef class Token:
+    """An individual token.
+
+    Internally, the Token is a tuple (i, tokens) --- it delegates to the Tokens
+    object.
+    """
     def __init__(self, Tokens tokens, int i):
         self._seq = tokens
         self.i = i

@@ -163,21 +173,44 @@ cdef class Token:
         return self.string + ' '

     def __len__(self):
+        """The number of unicode code-points in the original string.
+
+        Returns:
+            length (int):
+        """
         return self._seq.data[self.i].lex.length

     property idx:
+        """The index into the original string at which the token starts.
+
+        The following is supposed to always be true:
+
+        >>> original_string[token.idx:token.idx + len(token)] == token.string
+        """
         def __get__(self):
             return self._seq.data[self.i].idx

-    property length:
-        def __get__(self):
-            return self._seq.data[self.i].lex.length
-
     property cluster:
+        """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering
+
+        Similar words have better-than-chance likelihood of having similar cluster
+        IDs, although the clustering is quite noisy. Cluster IDs make good features,
+        and help to make models slightly more robust to domain variation.
+
+        A common trick is to use only the first N bits of a cluster ID in a feature,
+        as the more general part of the hierarchical clustering is often more accurate
+        than the lower categories.
+
+        To assist in this, I encode the cluster IDs little-endian, to allow a simple
+        bit-mask:
+
+        >>> six_bits = cluster & (2**6 - 1)
+        """
         def __get__(self):
             return self._seq.data[self.i].lex.cluster

     property string:
+        """The unicode string of the word, with no whitespace padding."""
         def __get__(self):
             cdef const TokenC* t = &self._seq.data[self.i]
             if t.lex.sic == 0:
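The bit-mask idiom from the cluster docstring, written out as a hypothetical feature function (the function name and dict keys are illustrative, not spaCy API):

    def cluster_features(token):
        c = token.cluster
        return {
            'cluster_4': c & (2**4 - 1),     # coarse 4-bit prefix of the Brown path
            'cluster_6': c & (2**6 - 1),     # finer 6-bit prefix
            'cluster_full': c,
        }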
@@ -186,6 +219,9 @@ cdef class Token:
             return utf8string.decode('utf8')

     property lemma:
+        """The unicode string of the word's lemma. If no part-of-speech tag is
+        assigned, the most common part-of-speech tag of the word is used.
+        """
         def __get__(self):
             cdef const TokenC* t = &self._seq.data[self.i]
             if t.lemma == 0:

@@ -193,15 +229,27 @@ cdef class Token:
             cdef bytes utf8string = self._seq.vocab.strings[t.lemma]
             return utf8string.decode('utf8')

-    property dep:
+    property dep_tag:
+        """The ID integer of the word's dependency label. If no parse has been
+        assigned, defaults to 0.
+        """
         def __get__(self):
             return self._seq.data[self.i].dep_tag

     property pos:
+        """The ID integer of the word's part-of-speech tag, from the 13-tag
+        Google Universal Tag Set. Constants for this tag set are available in
+        spacy.typedefs.
+        """
         def __get__(self):
             return self._seq.data[self.i].pos

     property fine_pos:
+        """The ID integer of the word's fine-grained part-of-speech tag, as assigned
+        by the tagger model. Fine-grained tags include morphological information,
+        and other distinctions, and allow a more accurate tagger to be trained.
+        """
+
         def __get__(self):
             return self._seq.data[self.i].fine_pos

@@ -210,6 +258,7 @@ cdef class Token:
             return self._seq.data[self.i].lex.sic

     property head:
+        """The token predicted by the parser to be the head of the current token."""
         def __get__(self):
             cdef const TokenC* t = &self._seq.data[self.i]
             return Token(self._seq, self.i + t.head)
@@ -33,8 +33,6 @@ cpdef Lexeme init_lexeme(id_t i, unicode string, hash_t hashed,
 cdef class Vocab:
     '''A map container for a language's Lexeme structs.
-
-    Also interns UTF-8 strings, and maps them to consecutive integer IDs.
     '''
     def __init__(self, data_dir=None, get_lex_props=None):
         self.mem = Pool()

@@ -53,6 +51,7 @@ cdef class Vocab:
         self.load(path.join(data_dir, 'lexemes'))

     def __len__(self):
+        """The current number of lexemes stored."""
         return self.lexemes.size()

     cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL:
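A one-line sketch for the new Vocab.__len__ docstring (assuming a loaded pipeline as above):

    print(len(nlp.vocab))    # number of Lexeme structs currently stored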