mirror of https://github.com/explosion/spaCy.git (synced 2025-01-12 18:26:30 +03:00)
Rename Tokens to Doc

parent d0fc7f5ba9
commit bb522496dd
@@ -9,7 +9,6 @@ from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
 from preshed.maps cimport PreshMapArray

 from .typedefs cimport hash_t, id_t
-from .tokens cimport Tokens


 cdef int arg_max(const weight_t* scores, const int n_classes) nogil
@@ -9,7 +9,7 @@ from ..syntax.arc_eager import ArcEager
 from ..syntax.ner import BiluoPushDown
 from ..syntax.parser import ParserFactory

-from ..tokens import Tokens
+from ..tokens import Doc
 from ..multi_words import RegexMerger

 from .pos import EnPosTagger
@@ -14,7 +14,7 @@ from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
 from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL
 from ..typedefs cimport id_t
 from ..structs cimport TokenC, Morphology, LexemeC
-from ..tokens cimport Tokens
+from ..tokens cimport Doc
 from ..morphology cimport set_morph_from_dict
 from .._ml cimport arg_max

@@ -260,11 +260,11 @@ cdef class EnPosTagger:
                                            'morphs.json'))))
         self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)

-    def __call__(self, Tokens tokens):
-        """Apply the tagger, setting the POS tags onto the Tokens object.
+    def __call__(self, Doc tokens):
+        """Apply the tagger, setting the POS tags onto the Doc object.

         Args:
-            tokens (Tokens): The tokens to be tagged.
+            tokens (Doc): The tokens to be tagged.
         """
         if tokens.length == 0:
             return 0
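Note that the hunk above changes the declared type only; the tagger's call pattern is untouched. A minimal usage sketch, assuming a built 0.x tree in which a `tokenizer` (Tokenizer) and `tagger` (EnPosTagger) have already been constructed:

    # Hedged sketch: `tokenizer` and `tagger` are assumed to exist.
    doc = tokenizer(u'This is a sentence.')  # returns a Doc (formerly Tokens)
    tagger(doc)                              # writes POS tags onto the Doc in place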
@@ -282,7 +282,7 @@ cdef class EnPosTagger:
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length

-    def tag_from_strings(self, Tokens tokens, object tag_strs):
+    def tag_from_strings(self, Doc tokens, object tag_strs):
         cdef int i
         for i in range(tokens.length):
             tokens.data[i].tag = self.strings[tag_strs[i]]
@@ -291,7 +291,7 @@ cdef class EnPosTagger:
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length

-    def train(self, Tokens tokens, object gold_tag_strs):
+    def train(self, Doc tokens, object gold_tag_strs):
         cdef int i
         cdef int loss
         cdef atom_t[N_CONTEXT_FIELDS] context
@@ -1,4 +1,4 @@
-from .tokens cimport Tokens
+from .tokens cimport Doc
 from .typedefs cimport flags_t, attr_id_t, attr_t
 from .parts_of_speech cimport univ_pos_t
 from .structs cimport Morphology, TokenC, LexemeC
@@ -7,7 +7,7 @@ from .strings cimport StringStore


 cdef class Span:
-    cdef readonly Tokens _seq
+    cdef readonly Doc _seq
     cdef public int i
     cdef public int start
     cdef public int end
@@ -3,8 +3,8 @@ from collections import defaultdict


 cdef class Span:
-    """A slice from a Tokens object."""
-    def __cinit__(self, Tokens tokens, int start, int end, int label=0):
+    """A slice from a Doc object."""
+    def __cinit__(self, Doc tokens, int start, int end, int label=0):
         self._seq = tokens
         self.start = start
         self.end = end
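For context on the `__cinit__` change: a Span is still built from the parent container plus token offsets, so call sites change only in the declared type. A hedged sketch, assuming a `doc` of at least four tokens:

    # Sketch: a Span over tokens 1 and 2; `label` defaults to 0 as above.
    span = Span(doc, 1, 3)
    assert span.start == 1 and span.end == 3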
@@ -4,7 +4,7 @@ from .._ml cimport Model

 from .arc_eager cimport TransitionSystem

-from ..tokens cimport Tokens, TokenC
+from ..tokens cimport Doc, TokenC


 cdef class Parser:
@@ -12,5 +12,5 @@ cdef class Parser:
     cdef readonly Model model
     cdef readonly TransitionSystem moves

-    cdef int _greedy_parse(self, Tokens tokens) except -1
-    cdef int _beam_parse(self, Tokens tokens) except -1
+    cdef int _greedy_parse(self, Doc tokens) except -1
+    cdef int _beam_parse(self, Doc tokens) except -1
@@ -31,7 +31,7 @@ from thinc.learner cimport LinearModel
 from thinc.search cimport Beam
 from thinc.search cimport MaxViolation

-from ..tokens cimport Tokens, TokenC
+from ..tokens cimport Doc, TokenC
 from ..strings cimport StringStore


@@ -75,20 +75,20 @@ cdef class Parser:
         templates = get_templates(self.cfg.features)
         self.model = Model(self.moves.n_moves, templates, model_dir)

-    def __call__(self, Tokens tokens):
+    def __call__(self, Doc tokens):
         if self.cfg.get('beam_width', 1) < 1:
             self._greedy_parse(tokens)
         else:
             self._beam_parse(tokens)

-    def train(self, Tokens tokens, GoldParse gold):
+    def train(self, Doc tokens, GoldParse gold):
         self.moves.preprocess_gold(gold)
         if self.cfg.beam_width < 1:
             return self._greedy_train(tokens, gold)
         else:
             return self._beam_train(tokens, gold)

-    cdef int _greedy_parse(self, Tokens tokens) except -1:
+    cdef int _greedy_parse(self, Doc tokens) except -1:
         cdef atom_t[CONTEXT_SIZE] context
         cdef int n_feats
         cdef Pool mem = Pool()
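The `__call__` body above fixes the dispatch rule: a configured `beam_width` below 1 selects the greedy parser, while the default of 1 (or anything larger) routes through the beam machinery. A standalone restatement of that rule, with a plain dict standing in for the real config object:

    # Hedged sketch of the dispatch logic; `cfg` here is an ordinary dict.
    def choose_parse(cfg):
        if cfg.get('beam_width', 1) < 1:
            return 'greedy'   # _greedy_parse
        return 'beam'         # _beam_parse, beam width >= 1

    assert choose_parse({}) == 'beam'
    assert choose_parse({'beam_width': 0}) == 'greedy'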
@@ -106,7 +106,7 @@ cdef class Parser:
         self.moves.finalize_state(stcls)
         tokens.set_parse(stcls._sent)

-    cdef int _beam_parse(self, Tokens tokens) except -1:
+    cdef int _beam_parse(self, Doc tokens) except -1:
         cdef Beam beam = Beam(self.moves.n_moves, self.cfg.beam_width)
         words = [w.orth_ for w in tokens]
         beam.initialize(_init_state, tokens.length, tokens.data)
@@ -118,7 +118,7 @@ cdef class Parser:
         tokens.set_parse(state._sent)
         _cleanup(beam)

-    def _greedy_train(self, Tokens tokens, GoldParse gold):
+    def _greedy_train(self, Doc tokens, GoldParse gold):
         cdef Pool mem = Pool()
         cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
         self.moves.initialize_state(stcls)
@@ -143,7 +143,7 @@ cdef class Parser:
             loss += cost
         return loss

-    def _beam_train(self, Tokens tokens, GoldParse gold_parse):
+    def _beam_train(self, Doc tokens, GoldParse gold_parse):
         cdef Beam pred = Beam(self.moves.n_moves, self.cfg.beam_width)
         pred.initialize(_init_state, tokens.length, tokens.data)
         pred.check_done(_check_final_state, NULL)
@@ -190,7 +190,7 @@ cdef class Parser:
             beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
             beam.check_done(_check_final_state, NULL)

-    def _count_feats(self, dict counts, Tokens tokens, list hist, int inc):
+    def _count_feats(self, dict counts, Doc tokens, list hist, int inc):
         cdef atom_t[CONTEXT_SIZE] context
         cdef Pool mem = Pool()
         cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
@@ -8,7 +8,7 @@ from cymem.cymem cimport Pool
 from .typedefs cimport hash_t
 from .structs cimport LexemeC, TokenC, Morphology, UniStr
 from .strings cimport StringStore
-from .tokens cimport Tokens
+from .tokens cimport Doc
 from .vocab cimport Vocab, _Cached


@@ -27,13 +27,13 @@ cdef class Tokenizer:
     cdef object _suffix_re
     cdef object _infix_re

-    cpdef Tokens tokens_from_list(self, list strings)
+    cpdef Doc tokens_from_list(self, list strings)

-    cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
-    cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
+    cdef int _try_cache(self, int idx, hash_t key, Doc tokens) except -1
+    cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1
     cdef UniStr* _split_affixes(self, UniStr* string, vector[LexemeC*] *prefixes,
                                 vector[LexemeC*] *suffixes) except NULL
-    cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
+    cdef int _attach_tokens(self, Doc tokens, int idx, UniStr* string,
                             vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
     cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
     cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
@@ -16,7 +16,7 @@ from .morphology cimport set_morph_from_dict

 from . import util
 from .util import read_lang_data
-from .tokens import Tokens
+from .tokens import Doc


 cdef class Tokenizer:
@@ -38,9 +38,9 @@ cdef class Tokenizer:
             infix_re = re.compile(infix_re)
         return cls(vocab, rules, prefix_re, suffix_re, infix_re)

-    cpdef Tokens tokens_from_list(self, list strings):
+    cpdef Doc tokens_from_list(self, list strings):
         cdef int length = sum([len(s) for s in strings])
-        cdef Tokens tokens = Tokens(self.vocab, ' '.join(strings))
+        cdef Doc tokens = Doc(self.vocab, ' '.join(strings))
         if length == 0:
             return tokens
         cdef UniStr string_struct
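`tokens_from_list` lets callers bypass segmentation when the text is pre-split; as the body shows, the strings are space-joined to back the new `Doc`. A hedged usage sketch, assuming a constructed `tokenizer`:

    # Sketch: build a Doc from tokens that are already split.
    doc = tokenizer.tokens_from_list([u'Hello', u'world', u'!'])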
@@ -70,10 +70,10 @@ cdef class Tokenizer:
         string (unicode): The string to be tokenized.

         Returns:
-            tokens (Tokens): A Tokens object, giving access to a sequence of LexemeCs.
+            tokens (Doc): A Doc object, giving access to a sequence of LexemeCs.
         """
         cdef int length = len(string)
-        cdef Tokens tokens = Tokens(self.vocab, string)
+        cdef Doc tokens = Doc(self.vocab, string)
         if length == 0:
             return tokens
         cdef int i = 0
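The docstring above is the user-facing contract of `Tokenizer.__call__`: unicode string in, `Doc` out. Sketch, again assuming a constructed `tokenizer`:

    # Sketch: tokenize raw text; the return type is now Doc (formerly Tokens).
    doc = tokenizer(u'Give it back! He pleaded.')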
@@ -101,7 +101,7 @@ cdef class Tokenizer:
             self._tokenize(tokens, &span, start, i)
         return tokens

-    cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
+    cdef int _try_cache(self, int idx, hash_t key, Doc tokens) except -1:
         cached = <_Cached*>self._cache.get(key)
         if cached == NULL:
             return False
@@ -114,7 +114,7 @@ cdef class Tokenizer:
                 idx = tokens.push_back(idx, &cached.data.tokens[i])
         return True

-    cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
+    cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1:
         cdef vector[LexemeC*] prefixes
         cdef vector[LexemeC*] suffixes
         cdef hash_t orig_key
@@ -167,7 +167,7 @@ cdef class Tokenizer:
                 break
         return string

-    cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
+    cdef int _attach_tokens(self, Doc tokens, int idx, UniStr* string,
                             vector[const LexemeC*] *prefixes,
                             vector[const LexemeC*] *suffixes) except -1:
         cdef bint cache_hit
@@ -28,13 +28,12 @@ cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
     return lexeme.flags & (1 << flag_id)


-cdef class Tokens:
+cdef class Doc:
     cdef Pool mem
     cdef Vocab vocab

     cdef TokenC* data

-
     cdef list _py_tokens
     cdef unicode _string
     cdef tuple _tag_strings
@@ -62,12 +61,12 @@ cdef class Token:
     cdef bint _owns_c_data


-    cdef Tokens _seq
+    cdef Doc _seq

    @staticmethod
    cdef inline Token cinit(Vocab vocab, unicode string,
                            const TokenC* token, int offset, int array_len,
-                           Tokens parent_seq):
+                           Doc parent_seq):
        if offset < 0 or offset >= array_len:

            msg = "Attempt to access token at %d, max length %d"
@@ -78,7 +78,7 @@ cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
         return 0


-cdef class Tokens:
+cdef class Doc:
     """
     Container class for annotated text. Constructed via English.__call__ or
     Tokenizer.__call__.
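As the docstring notes, `Doc` is the central container and is produced rather than constructed directly. A hedged sketch of the two documented routes, assuming `nlp = English()` from spacy.en and that the tokenizer is reachable as `nlp.tokenizer` (attribute name assumed):

    doc = nlp(u'Some text.')            # English.__call__: full pipeline
    doc = nlp.tokenizer(u'Some text.')  # Tokenizer.__call__: tokenization only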
@@ -185,7 +185,7 @@ cdef class Tokens:
         Yield a list of sentence Span objects, calculated from the dependency parse.
         """
         cdef int i
-        cdef Tokens sent = Tokens(self.vocab, self._string[self.data[0].idx:])
+        cdef Doc sent = Doc(self.vocab, self._string[self.data[0].idx:])
         start = 0
         for i in range(1, self.length):
             if self.data[i].sent_start:
@@ -370,7 +370,7 @@ cdef class Tokens:

 cdef class Token:
     """An individual token --- i.e. a word, a punctuation symbol, etc. Created
-    via Tokens.__getitem__ and Tokens.__iter__.
+    via Doc.__getitem__ and Doc.__iter__.
     """
     def __cinit__(self, Vocab vocab, unicode string):
         self.vocab = vocab
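Per the docstring, `Token` objects are not constructed by users; they materialize through indexing and iteration on a `Doc`. Sketch, assuming a non-empty `doc`:

    first = doc[0]                   # Doc.__getitem__
    words = [t.orth_ for t in doc]   # Doc.__iter__; .orth_ as used in the parser hunk above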