Mirror of https://github.com/explosion/spaCy.git
Rename Tokens to Doc

This commit is contained in:
parent d0fc7f5ba9
commit bb522496dd
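The change is a pure rename of the container class: every Tokens reference in imports, signatures and docstrings becomes Doc, while the attributes used throughout the hunks below (length, data, is_tagged, set_parse) are untouched. A rough Python sketch of what this means for downstream code; the spacy.tokens module path follows the relative imports in the diff, the rest is illustrative:

    from spacy.tokens import Doc    # was: from spacy.tokens import Tokens

    def n_tagged_tokens(doc):
        # the renamed class keeps the same interface
        return doc.length if doc.is_tagged else 0
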
@@ -9,7 +9,6 @@ from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
 from preshed.maps cimport PreshMapArray
 
 from .typedefs cimport hash_t, id_t
-from .tokens cimport Tokens
 
 
 cdef int arg_max(const weight_t* scores, const int n_classes) nogil

@@ -9,7 +9,7 @@ from ..syntax.arc_eager import ArcEager
 from ..syntax.ner import BiluoPushDown
 from ..syntax.parser import ParserFactory
 
-from ..tokens import Tokens
+from ..tokens import Doc
 from ..multi_words import RegexMerger
 
 from .pos import EnPosTagger

@@ -14,7 +14,7 @@ from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
 from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL
 from ..typedefs cimport id_t
 from ..structs cimport TokenC, Morphology, LexemeC
-from ..tokens cimport Tokens
+from ..tokens cimport Doc
 from ..morphology cimport set_morph_from_dict
 from .._ml cimport arg_max
 

@@ -260,11 +260,11 @@ cdef class EnPosTagger:
                                                     'morphs.json'))))
         self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)
 
-    def __call__(self, Tokens tokens):
-        """Apply the tagger, setting the POS tags onto the Tokens object.
+    def __call__(self, Doc tokens):
+        """Apply the tagger, setting the POS tags onto the Doc object.
 
         Args:
-            tokens (Tokens): The tokens to be tagged.
+            tokens (Doc): The tokens to be tagged.
         """
         if tokens.length == 0:
             return 0

@@ -282,7 +282,7 @@ cdef class EnPosTagger:
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
 
-    def tag_from_strings(self, Tokens tokens, object tag_strs):
+    def tag_from_strings(self, Doc tokens, object tag_strs):
         cdef int i
         for i in range(tokens.length):
             tokens.data[i].tag = self.strings[tag_strs[i]]

@@ -291,7 +291,7 @@ cdef class EnPosTagger:
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
 
-    def train(self, Tokens tokens, object gold_tag_strs):
+    def train(self, Doc tokens, object gold_tag_strs):
         cdef int i
         cdef int loss
         cdef atom_t[N_CONTEXT_FIELDS] context

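For orientation, a minimal sketch of driving the tagger after the rename. The method signatures (__call__, tag_from_strings, train) are taken from the hunks above; construction of the tagger and of the Doc is assumed and elided:

    # tagger = EnPosTagger(...)          # construction is not part of this diff
    # doc = tokenizer(u'Give it back')   # any Doc produced by the tokenizer
    tagger(doc)                          # predict tags in place; sets doc.is_tagged
    tagger.tag_from_strings(doc, tags)   # or impose a known tag sequence
    tagger.train(doc, gold_tags)         # gold_tags is a sequence of tag strings
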
@@ -1,4 +1,4 @@
-from .tokens cimport Tokens
+from .tokens cimport Doc
 from .typedefs cimport flags_t, attr_id_t, attr_t
 from .parts_of_speech cimport univ_pos_t
 from .structs cimport Morphology, TokenC, LexemeC

@@ -7,7 +7,7 @@ from .strings cimport StringStore
 
 
 cdef class Span:
-    cdef readonly Tokens _seq
+    cdef readonly Doc _seq
     cdef public int i
     cdef public int start
     cdef public int end

@@ -3,8 +3,8 @@ from collections import defaultdict
 
 
 cdef class Span:
-    """A slice from a Tokens object."""
-    def __cinit__(self, Tokens tokens, int start, int end, int label=0):
+    """A slice from a Doc object."""
+    def __cinit__(self, Doc tokens, int start, int end, int label=0):
         self._seq = tokens
         self.start = start
         self.end = end

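A hedged sketch of the constructor shown above: a Span is a view over a Doc between two token offsets, with an optional label id. The doc and label_id names are placeholders, and the end offset is assumed to be exclusive, as in a slice:

    span = Span(doc, 2, 5)                    # covers the tokens at positions 2, 3 and 4
    entity = Span(doc, 0, 3, label=label_id)  # optional label, e.g. an entity type id
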
@@ -4,7 +4,7 @@ from .._ml cimport Model
 
 from .arc_eager cimport TransitionSystem
 
-from ..tokens cimport Tokens, TokenC
+from ..tokens cimport Doc, TokenC
 
 
 cdef class Parser:

@@ -12,5 +12,5 @@ cdef class Parser:
     cdef readonly Model model
     cdef readonly TransitionSystem moves
 
-    cdef int _greedy_parse(self, Tokens tokens) except -1
-    cdef int _beam_parse(self, Tokens tokens) except -1
+    cdef int _greedy_parse(self, Doc tokens) except -1
+    cdef int _beam_parse(self, Doc tokens) except -1

@@ -31,7 +31,7 @@ from thinc.learner cimport LinearModel
 from thinc.search cimport Beam
 from thinc.search cimport MaxViolation
 
-from ..tokens cimport Tokens, TokenC
+from ..tokens cimport Doc, TokenC
 from ..strings cimport StringStore
 
 

@@ -75,20 +75,20 @@ cdef class Parser:
         templates = get_templates(self.cfg.features)
         self.model = Model(self.moves.n_moves, templates, model_dir)
 
-    def __call__(self, Tokens tokens):
+    def __call__(self, Doc tokens):
         if self.cfg.get('beam_width', 1) < 1:
             self._greedy_parse(tokens)
         else:
             self._beam_parse(tokens)
 
-    def train(self, Tokens tokens, GoldParse gold):
+    def train(self, Doc tokens, GoldParse gold):
         self.moves.preprocess_gold(gold)
         if self.cfg.beam_width < 1:
             return self._greedy_train(tokens, gold)
         else:
             return self._beam_train(tokens, gold)
 
-    cdef int _greedy_parse(self, Tokens tokens) except -1:
+    cdef int _greedy_parse(self, Doc tokens) except -1:
         cdef atom_t[CONTEXT_SIZE] context
         cdef int n_feats
         cdef Pool mem = Pool()

@@ -106,7 +106,7 @@ cdef class Parser:
         self.moves.finalize_state(stcls)
         tokens.set_parse(stcls._sent)
 
-    cdef int _beam_parse(self, Tokens tokens) except -1:
+    cdef int _beam_parse(self, Doc tokens) except -1:
         cdef Beam beam = Beam(self.moves.n_moves, self.cfg.beam_width)
         words = [w.orth_ for w in tokens]
         beam.initialize(_init_state, tokens.length, tokens.data)

@@ -118,7 +118,7 @@ cdef class Parser:
         tokens.set_parse(state._sent)
         _cleanup(beam)
 
-    def _greedy_train(self, Tokens tokens, GoldParse gold):
+    def _greedy_train(self, Doc tokens, GoldParse gold):
         cdef Pool mem = Pool()
         cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
         self.moves.initialize_state(stcls)

@@ -143,7 +143,7 @@ cdef class Parser:
             loss += cost
         return loss
 
-    def _beam_train(self, Tokens tokens, GoldParse gold_parse):
+    def _beam_train(self, Doc tokens, GoldParse gold_parse):
         cdef Beam pred = Beam(self.moves.n_moves, self.cfg.beam_width)
         pred.initialize(_init_state, tokens.length, tokens.data)
         pred.check_done(_check_final_state, NULL)

@@ -190,7 +190,7 @@ cdef class Parser:
             beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
             beam.check_done(_check_final_state, NULL)
 
-    def _count_feats(self, dict counts, Tokens tokens, list hist, int inc):
+    def _count_feats(self, dict counts, Doc tokens, list hist, int inc):
         cdef atom_t[CONTEXT_SIZE] context
         cdef Pool mem = Pool()
         cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)

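As with the tagger, only the declared argument type changes. A sketch of the two entry points whose signatures appear above; parser construction and GoldParse creation are outside this diff:

    parser(doc)                      # greedy or beam parse, depending on the beam_width config
    loss = parser.train(doc, gold)   # gold is a GoldParse; the greedy path returns a loss
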
@@ -8,7 +8,7 @@ from cymem.cymem cimport Pool
 from .typedefs cimport hash_t
 from .structs cimport LexemeC, TokenC, Morphology, UniStr
 from .strings cimport StringStore
-from .tokens cimport Tokens
+from .tokens cimport Doc
 from .vocab cimport Vocab, _Cached
 
 

@@ -27,13 +27,13 @@ cdef class Tokenizer:
     cdef object _suffix_re
     cdef object _infix_re
 
-    cpdef Tokens tokens_from_list(self, list strings)
+    cpdef Doc tokens_from_list(self, list strings)
 
-    cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
-    cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
+    cdef int _try_cache(self, int idx, hash_t key, Doc tokens) except -1
+    cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1
     cdef UniStr* _split_affixes(self, UniStr* string, vector[LexemeC*] *prefixes,
                                 vector[LexemeC*] *suffixes) except NULL
-    cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
+    cdef int _attach_tokens(self, Doc tokens, int idx, UniStr* string,
                             vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
     cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
     cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1

@@ -16,7 +16,7 @@ from .morphology cimport set_morph_from_dict
 
 from . import util
 from .util import read_lang_data
-from .tokens import Tokens
+from .tokens import Doc
 
 
 cdef class Tokenizer:

@@ -38,9 +38,9 @@ cdef class Tokenizer:
         infix_re = re.compile(infix_re)
         return cls(vocab, rules, prefix_re, suffix_re, infix_re)
 
-    cpdef Tokens tokens_from_list(self, list strings):
+    cpdef Doc tokens_from_list(self, list strings):
         cdef int length = sum([len(s) for s in strings])
-        cdef Tokens tokens = Tokens(self.vocab, ' '.join(strings))
+        cdef Doc tokens = Doc(self.vocab, ' '.join(strings))
         if length == 0:
             return tokens
         cdef UniStr string_struct

@@ -70,10 +70,10 @@ cdef class Tokenizer:
             string (unicode): The string to be tokenized.
 
         Returns:
-            tokens (Tokens): A Tokens object, giving access to a sequence of LexemeCs.
+            tokens (Doc): A Doc object, giving access to a sequence of LexemeCs.
         """
         cdef int length = len(string)
-        cdef Tokens tokens = Tokens(self.vocab, string)
+        cdef Doc tokens = Doc(self.vocab, string)
         if length == 0:
             return tokens
         cdef int i = 0

@@ -101,7 +101,7 @@ cdef class Tokenizer:
             self._tokenize(tokens, &span, start, i)
         return tokens
 
-    cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
+    cdef int _try_cache(self, int idx, hash_t key, Doc tokens) except -1:
         cached = <_Cached*>self._cache.get(key)
         if cached == NULL:
             return False

@@ -114,7 +114,7 @@ cdef class Tokenizer:
            idx = tokens.push_back(idx, &cached.data.tokens[i])
         return True
 
-    cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
+    cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1:
         cdef vector[LexemeC*] prefixes
         cdef vector[LexemeC*] suffixes
         cdef hash_t orig_key

@@ -167,7 +167,7 @@ cdef class Tokenizer:
                 break
         return string
 
-    cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
+    cdef int _attach_tokens(self, Doc tokens, int idx, UniStr* string,
                             vector[const LexemeC*] *prefixes,
                             vector[const LexemeC*] *suffixes) except -1:
         cdef bint cache_hit

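A sketch of the two public tokenizer entry points whose signatures appear in the hunks above; how the Tokenizer itself is constructed (its from_dir classmethod and __init__) is not part of this diff:

    doc = tokenizer(u'This is a sentence.')                    # returns a Doc
    doc2 = tokenizer.tokens_from_list([u'Already', u'split'])  # joined with spaces internally
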
@@ -28,13 +28,12 @@ cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
     return lexeme.flags & (1 << flag_id)
 
 
-cdef class Tokens:
+cdef class Doc:
     cdef Pool mem
     cdef Vocab vocab
 
     cdef TokenC* data
 
-
     cdef list _py_tokens
     cdef unicode _string
     cdef tuple _tag_strings

@@ -62,12 +61,12 @@ cdef class Token:
     cdef bint _owns_c_data
 
 
-    cdef Tokens _seq
+    cdef Doc _seq
 
     @staticmethod
     cdef inline Token cinit(Vocab vocab, unicode string,
                             const TokenC* token, int offset, int array_len,
-                            Tokens parent_seq):
+                            Doc parent_seq):
         if offset < 0 or offset >= array_len:
 
             msg = "Attempt to access token at %d, max length %d"

@@ -78,7 +78,7 @@ cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
         return 0
 
 
-cdef class Tokens:
+cdef class Doc:
     """
     Container class for annotated text. Constructed via English.__call__ or
     Tokenizer.__call__.

@@ -185,7 +185,7 @@ cdef class Tokens:
         Yield a list of sentence Span objects, calculated from the dependency parse.
         """
         cdef int i
-        cdef Tokens sent = Tokens(self.vocab, self._string[self.data[0].idx:])
+        cdef Doc sent = Doc(self.vocab, self._string[self.data[0].idx:])
         start = 0
         for i in range(1, self.length):
             if self.data[i].sent_start:

@@ -370,7 +370,7 @@ cdef class Tokens:
 
 cdef class Token:
     """An individual token --- i.e. a word, a punctuation symbol, etc. Created
-    via Tokens.__getitem__ and Tokens.__iter__.
+    via Doc.__getitem__ and Doc.__iter__.
     """
     def __cinit__(self, Vocab vocab, unicode string):
         self.vocab = vocab

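Finally, the renamed container is still indexed and iterated the same way: the Token docstring above points at Doc.__getitem__ and Doc.__iter__, and the parser hunk already iterates a Doc to read w.orth_. A small assumed usage sketch; the sents name is inferred from the "sentence Span objects" docstring and is not shown explicitly in the diff:

    first = doc[0]                    # a Token, via Doc.__getitem__
    words = [w.orth_ for w in doc]    # Doc.__iter__ yields Token objects
    for sent in doc.sents:            # sentence Spans from the dependency parse
        print(sent.start, sent.end)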