From bb522496dd4b17135ee33b5daa388c5b7491feea Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 8 Jul 2015 18:53:00 +0200
Subject: [PATCH] * Rename Tokens to Doc

---
 spacy/_ml.pxd | 1 -
 spacy/en/__init__.py | 2 +-
 spacy/en/pos.pyx | 12 ++++++------
 spacy/spans.pxd | 4 ++--
 spacy/spans.pyx | 4 ++--
 spacy/syntax/parser.pxd | 6 +++---
 spacy/syntax/parser.pyx | 16 ++++++++--------
 spacy/tokenizer.pxd | 10 +++++-----
 spacy/tokenizer.pyx | 16 ++++++++--------
 spacy/tokens.pxd | 7 +++----
 spacy/tokens.pyx | 6 +++---
 11 files changed, 41 insertions(+), 43 deletions(-)

diff --git a/spacy/_ml.pxd b/spacy/_ml.pxd
index add162e69..e39b3a5e3 100644
--- a/spacy/_ml.pxd
+++ b/spacy/_ml.pxd
@@ -9,7 +9,6 @@ from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
 from preshed.maps cimport PreshMapArray
 
 from .typedefs cimport hash_t, id_t
-from .tokens cimport Tokens
 
 
 cdef int arg_max(const weight_t* scores, const int n_classes) nogil
diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py
index ca4518a60..6baf444ab 100644
--- a/spacy/en/__init__.py
+++ b/spacy/en/__init__.py
@@ -9,7 +9,7 @@ from ..syntax.arc_eager import ArcEager
 from ..syntax.ner import BiluoPushDown
 from ..syntax.parser import ParserFactory
 
-from ..tokens import Tokens
+from ..tokens import Doc
 from ..multi_words import RegexMerger
 
 from .pos import EnPosTagger
diff --git a/spacy/en/pos.pyx b/spacy/en/pos.pyx
index dd541c72a..97d0613cf 100644
--- a/spacy/en/pos.pyx
+++ b/spacy/en/pos.pyx
@@ -14,7 +14,7 @@ from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
 from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL
 from ..typedefs cimport id_t
 from ..structs cimport TokenC, Morphology, LexemeC
-from ..tokens cimport Tokens
+from ..tokens cimport Doc
 from ..morphology cimport set_morph_from_dict
 from .._ml cimport arg_max
 
@@ -260,11 +260,11 @@ cdef class EnPosTagger:
                                                              'morphs.json'))))
         self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB,
                                      ADJ)
 
-    def __call__(self, Tokens tokens):
-        """Apply the tagger, setting the POS tags onto the Tokens object.
+    def __call__(self, Doc tokens):
+        """Apply the tagger, setting the POS tags onto the Doc object.
         Args:
-            tokens (Tokens): The tokens to be tagged.
+            tokens (Doc): The tokens to be tagged.
         """
         if tokens.length == 0:
             return 0
@@ -282,7 +282,7 @@ cdef class EnPosTagger:
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
 
-    def tag_from_strings(self, Tokens tokens, object tag_strs):
+    def tag_from_strings(self, Doc tokens, object tag_strs):
         cdef int i
         for i in range(tokens.length):
             tokens.data[i].tag = self.strings[tag_strs[i]]
@@ -291,7 +291,7 @@ cdef class EnPosTagger:
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
 
-    def train(self, Tokens tokens, object gold_tag_strs):
+    def train(self, Doc tokens, object gold_tag_strs):
         cdef int i
         cdef int loss
         cdef atom_t[N_CONTEXT_FIELDS] context
diff --git a/spacy/spans.pxd b/spacy/spans.pxd
index ffbac9c36..8afcdfa6a 100644
--- a/spacy/spans.pxd
+++ b/spacy/spans.pxd
@@ -1,4 +1,4 @@
-from .tokens cimport Tokens
+from .tokens cimport Doc
 from .typedefs cimport flags_t, attr_id_t, attr_t
 from .parts_of_speech cimport univ_pos_t
 from .structs cimport Morphology, TokenC, LexemeC
@@ -7,7 +7,7 @@ from .strings cimport StringStore
 
 
 cdef class Span:
-    cdef readonly Tokens _seq
+    cdef readonly Doc _seq
     cdef public int i
     cdef public int start
     cdef public int end
diff --git a/spacy/spans.pyx b/spacy/spans.pyx
index 6b593e381..19a1f640c 100644
--- a/spacy/spans.pyx
+++ b/spacy/spans.pyx
@@ -3,8 +3,8 @@ from collections import defaultdict
 
 
 cdef class Span:
-    """A slice from a Tokens object."""
-    def __cinit__(self, Tokens tokens, int start, int end, int label=0):
+    """A slice from a Doc object."""
+    def __cinit__(self, Doc tokens, int start, int end, int label=0):
         self._seq = tokens
         self.start = start
         self.end = end
diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd
index 103ff9c02..dfc5c74a2 100644
--- a/spacy/syntax/parser.pxd
+++ b/spacy/syntax/parser.pxd
@@ -4,7 +4,7 @@ from .._ml cimport Model
 
 from .arc_eager cimport TransitionSystem
 
-from ..tokens cimport Tokens, TokenC
+from ..tokens cimport Doc, TokenC
 
 
 cdef class Parser:
@@ -12,5 +12,5 @@ cdef class Parser:
     cdef readonly Model model
     cdef readonly TransitionSystem moves
 
-    cdef int _greedy_parse(self, Tokens tokens) except -1
-    cdef int _beam_parse(self, Tokens tokens) except -1
+    cdef int _greedy_parse(self, Doc tokens) except -1
+    cdef int _beam_parse(self, Doc tokens) except -1
diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx
index e881fef7d..a8fe20a31 100644
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@@ -31,7 +31,7 @@ from thinc.learner cimport LinearModel
 from thinc.search cimport Beam
 from thinc.search cimport MaxViolation
 
-from ..tokens cimport Tokens, TokenC
+from ..tokens cimport Doc, TokenC
 from ..strings cimport StringStore
 
 
@@ -75,20 +75,20 @@ cdef class Parser:
         templates = get_templates(self.cfg.features)
         self.model = Model(self.moves.n_moves, templates, model_dir)
 
-    def __call__(self, Tokens tokens):
+    def __call__(self, Doc tokens):
         if self.cfg.get('beam_width', 1) < 1:
             self._greedy_parse(tokens)
         else:
             self._beam_parse(tokens)
 
-    def train(self, Tokens tokens, GoldParse gold):
+    def train(self, Doc tokens, GoldParse gold):
         self.moves.preprocess_gold(gold)
         if self.cfg.beam_width < 1:
             return self._greedy_train(tokens, gold)
         else:
             return self._beam_train(tokens, gold)
 
-    cdef int _greedy_parse(self, Tokens tokens) except -1:
+    cdef int _greedy_parse(self, Doc tokens) except -1:
         cdef atom_t[CONTEXT_SIZE] context
         cdef int n_feats
         cdef Pool mem = Pool()
@@ -106,7 +106,7 @@ cdef class Parser:
         self.moves.finalize_state(stcls)
         tokens.set_parse(stcls._sent)
 
-    cdef int _beam_parse(self, Tokens tokens) except -1:
+    cdef int _beam_parse(self, Doc tokens) except -1:
        cdef Beam beam = Beam(self.moves.n_moves, self.cfg.beam_width)
        words = [w.orth_ for w in tokens]
        beam.initialize(_init_state, tokens.length, tokens.data)
@@ -118,7 +118,7 @@ cdef class Parser:
         tokens.set_parse(state._sent)
         _cleanup(beam)
 
-    def _greedy_train(self, Tokens tokens, GoldParse gold):
+    def _greedy_train(self, Doc tokens, GoldParse gold):
         cdef Pool mem = Pool()
         cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
         self.moves.initialize_state(stcls)
@@ -143,7 +143,7 @@ cdef class Parser:
             loss += cost
         return loss
 
-    def _beam_train(self, Tokens tokens, GoldParse gold_parse):
+    def _beam_train(self, Doc tokens, GoldParse gold_parse):
         cdef Beam pred = Beam(self.moves.n_moves, self.cfg.beam_width)
         pred.initialize(_init_state, tokens.length, tokens.data)
         pred.check_done(_check_final_state, NULL)
@@ -190,7 +190,7 @@ cdef class Parser:
             beam.advance(_transition_state, _hash_state, self.moves.c)
             beam.check_done(_check_final_state, NULL)
 
-    def _count_feats(self, dict counts, Tokens tokens, list hist, int inc):
+    def _count_feats(self, dict counts, Doc tokens, list hist, int inc):
         cdef atom_t[CONTEXT_SIZE] context
         cdef Pool mem = Pool()
         cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index e0340602b..d5c68f8e5 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -8,7 +8,7 @@ from cymem.cymem cimport Pool
 from .typedefs cimport hash_t
 from .structs cimport LexemeC, TokenC, Morphology, UniStr
 from .strings cimport StringStore
-from .tokens cimport Tokens
+from .tokens cimport Doc
 from .vocab cimport Vocab, _Cached
 
 
@@ -27,13 +27,13 @@ cdef class Tokenizer:
     cdef object _suffix_re
    cdef object _infix_re
 
-    cpdef Tokens tokens_from_list(self, list strings)
+    cpdef Doc tokens_from_list(self, list strings)
 
-    cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
-    cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
+    cdef int _try_cache(self, int idx, hash_t key, Doc tokens) except -1
+    cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1
     cdef UniStr* _split_affixes(self, UniStr* string, vector[LexemeC*] *prefixes,
                                 vector[LexemeC*] *suffixes) except NULL
-    cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
+    cdef int _attach_tokens(self, Doc tokens, int idx, UniStr* string,
                             vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
     cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
     cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 54e0c88d2..d287ec9ca 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -16,7 +16,7 @@ from .morphology cimport set_morph_from_dict
 
 from . import util
 from .util import read_lang_data
-from .tokens import Tokens
+from .tokens import Doc
 
 
 cdef class Tokenizer:
@@ -38,9 +38,9 @@ cdef class Tokenizer:
         infix_re = re.compile(infix_re)
         return cls(vocab, rules, prefix_re, suffix_re, infix_re)
 
-    cpdef Tokens tokens_from_list(self, list strings):
+    cpdef Doc tokens_from_list(self, list strings):
         cdef int length = sum([len(s) for s in strings])
-        cdef Tokens tokens = Tokens(self.vocab, ' '.join(strings))
+        cdef Doc tokens = Doc(self.vocab, ' '.join(strings))
         if length == 0:
             return tokens
         cdef UniStr string_struct
@@ -70,10 +70,10 @@ cdef class Tokenizer:
             string (unicode): The string to be tokenized.
 
         Returns:
-            tokens (Tokens): A Tokens object, giving access to a sequence of LexemeCs.
+            tokens (Doc): A Doc object, giving access to a sequence of LexemeCs.
         """
         cdef int length = len(string)
-        cdef Tokens tokens = Tokens(self.vocab, string)
+        cdef Doc tokens = Doc(self.vocab, string)
         if length == 0:
             return tokens
         cdef int i = 0
@@ -101,7 +101,7 @@ cdef class Tokenizer:
             self._tokenize(tokens, &span, start, i)
         return tokens
 
-    cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
+    cdef int _try_cache(self, int idx, hash_t key, Doc tokens) except -1:
         cached = <_Cached*>self._cache.get(key)
         if cached == NULL:
             return False
@@ -114,7 +114,7 @@ cdef class Tokenizer:
                 idx = tokens.push_back(idx, &cached.data.tokens[i])
         return True
 
-    cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
+    cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1:
         cdef vector[LexemeC*] prefixes
         cdef vector[LexemeC*] suffixes
         cdef hash_t orig_key
@@ -167,7 +167,7 @@ cdef class Tokenizer:
                 break
         return string
 
-    cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
+    cdef int _attach_tokens(self, Doc tokens, int idx, UniStr* string,
                             vector[const LexemeC*] *prefixes,
                             vector[const LexemeC*] *suffixes) except -1:
         cdef bint cache_hit
diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd
index 8b3ff9fe9..ca753501e 100644
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@@ -28,13 +28,12 @@ cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
     return lexeme.flags & (1 << flag_id)
 
 
-cdef class Tokens:
+cdef class Doc:
     cdef Pool mem
     cdef Vocab vocab
 
     cdef TokenC* data
 
-
     cdef list _py_tokens
     cdef unicode _string
     cdef tuple _tag_strings
@@ -62,12 +61,12 @@ cdef class Token:
 
 
     cdef bint _owns_c_data
 
-    cdef Tokens _seq
+    cdef Doc _seq
 
     @staticmethod
     cdef inline Token cinit(Vocab vocab, unicode string,
                             const TokenC* token, int offset, int array_len,
-                            Tokens parent_seq):
+                            Doc parent_seq):
         if offset < 0 or offset >= array_len:
             msg = "Attempt to access token at %d, max length %d"
diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index 346565c14..9fa07f1ee 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -78,7 +78,7 @@ cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
         return 0
 
 
-cdef class Tokens:
+cdef class Doc:
     """
     Container class for annotated text. Constructed via English.__call__ or
     Tokenizer.__call__.
@@ -185,7 +185,7 @@ cdef class Tokens:
         Yield a list of sentence Span objects, calculated from the dependency parse.
         """
         cdef int i
-        cdef Tokens sent = Tokens(self.vocab, self._string[self.data[0].idx:])
+        cdef Doc sent = Doc(self.vocab, self._string[self.data[0].idx:])
         start = 0
         for i in range(1, self.length):
             if self.data[i].sent_start:
@@ -370,7 +370,7 @@ cdef class Tokens:
 
 cdef class Token:
     """An individual token --- i.e. a word, a punctuation symbol, etc. Created
-    via Tokens.__getitem__ and Tokens.__iter__.
+    via Doc.__getitem__ and Doc.__iter__.
     """
     def __cinit__(self, Vocab vocab, unicode string):
        self.vocab = vocab
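
Not part of the patch: a minimal usage sketch of the renamed class, assuming the 2015-era spacy.en.English entry point (the Doc docstring above says a Doc is constructed via English.__call__ or Tokenizer.__call__). The nlp name and the sample sentence are illustrative only.

# Illustrative sketch, not from the patch. Assumes spacy.en.English loads the
# tokenizer, tagger and parser; after this commit its __call__ returns a Doc
# (formerly Tokens).
from spacy.en import English

nlp = English()
doc = nlp(u'Rename Tokens to Doc.')   # a Doc, via English.__call__

for token in doc:                     # Token objects, via Doc.__iter__
    print(token.orth_)

first = doc[0]                        # a single Token, via Doc.__getitem__

# Sentence spans; assumes the sents accessor of that era, which yields the
# Span objects declared in spacy/spans.pxd (slices of the Doc with start/end).
for sent in doc.sents:
    print(sent.start, sent.end)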