* Rename Tokens to Doc

Matthew Honnibal 2015-07-08 18:53:00 +02:00
parent d0fc7f5ba9
commit bb522496dd
11 changed files with 41 additions and 43 deletions

View File

@@ -9,7 +9,6 @@ from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
from preshed.maps cimport PreshMapArray
from .typedefs cimport hash_t, id_t
from .tokens cimport Tokens
cdef int arg_max(const weight_t* scores, const int n_classes) nogil

View File

@@ -9,7 +9,7 @@ from ..syntax.arc_eager import ArcEager
from ..syntax.ner import BiluoPushDown
from ..syntax.parser import ParserFactory
from ..tokens import Tokens
from ..tokens import Doc
from ..multi_words import RegexMerger
from .pos import EnPosTagger
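For code that imported the container class directly, the import changes the same way. A minimal sketch, assuming the installed package name spacy:

    from spacy.tokens import Doc   # previously: from spacy.tokens import Tokens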

View File

@@ -14,7 +14,7 @@ from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL
from ..typedefs cimport id_t
from ..structs cimport TokenC, Morphology, LexemeC
from ..tokens cimport Tokens
from ..tokens cimport Doc
from ..morphology cimport set_morph_from_dict
from .._ml cimport arg_max
@@ -260,11 +260,11 @@ cdef class EnPosTagger:
'morphs.json'))))
self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)
def __call__(self, Tokens tokens):
"""Apply the tagger, setting the POS tags onto the Tokens object.
def __call__(self, Doc tokens):
"""Apply the tagger, setting the POS tags onto the Doc object.
Args:
tokens (Tokens): The tokens to be tagged.
tokens (Doc): The tokens to be tagged.
"""
if tokens.length == 0:
return 0
@@ -282,7 +282,7 @@ cdef class EnPosTagger:
tokens.is_tagged = True
tokens._py_tokens = [None] * tokens.length
def tag_from_strings(self, Tokens tokens, object tag_strs):
def tag_from_strings(self, Doc tokens, object tag_strs):
cdef int i
for i in range(tokens.length):
tokens.data[i].tag = self.strings[tag_strs[i]]
@@ -291,7 +291,7 @@ cdef class EnPosTagger:
tokens.is_tagged = True
tokens._py_tokens = [None] * tokens.length
def train(self, Tokens tokens, object gold_tag_strs):
def train(self, Doc tokens, object gold_tag_strs):
cdef int i
cdef int loss
cdef atom_t[N_CONTEXT_FIELDS] context
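For orientation, a brief usage sketch of the renamed signatures above. The tokenizer and tagger names are hypothetical stand-ins for the objects the English pipeline constructs; the tagger mutates the Doc in place rather than returning a new object:

    doc = tokenizer(u'They tagged the sentence.')   # a Doc produced by the Tokenizer
    tagger(doc)                                     # EnPosTagger.__call__(Doc) sets POS tags in place
    # Alternatively, copy tags from known strings (per tag_from_strings above):
    tagger.tag_from_strings(doc, [u'PRP', u'VBD', u'DT', u'NN', u'.'])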

View File

@@ -1,4 +1,4 @@
from .tokens cimport Tokens
from .tokens cimport Doc
from .typedefs cimport flags_t, attr_id_t, attr_t
from .parts_of_speech cimport univ_pos_t
from .structs cimport Morphology, TokenC, LexemeC
@@ -7,7 +7,7 @@ from .strings cimport StringStore
cdef class Span:
cdef readonly Tokens _seq
cdef readonly Doc _seq
cdef public int i
cdef public int start
cdef public int end

View File

@@ -3,8 +3,8 @@ from collections import defaultdict
cdef class Span:
"""A slice from a Tokens object."""
def __cinit__(self, Tokens tokens, int start, int end, int label=0):
"""A slice from a Doc object."""
def __cinit__(self, Doc tokens, int start, int end, int label=0):
self._seq = tokens
self.start = start
self.end = end
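A minimal sketch of the renamed constructor, assuming doc is a Doc with at least four tokens; in practice Span objects are usually created for you, for example by the sentence iterator touched later in this commit:

    span = Span(doc, 2, 4)   # covers tokens doc[2] and doc[3]; label defaults to 0
    # The Span keeps a back-reference to its parent Doc in span._seq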

View File

@@ -4,7 +4,7 @@ from .._ml cimport Model
from .arc_eager cimport TransitionSystem
from ..tokens cimport Tokens, TokenC
from ..tokens cimport Doc, TokenC
cdef class Parser:
@@ -12,5 +12,5 @@ cdef class Parser:
cdef readonly Model model
cdef readonly TransitionSystem moves
cdef int _greedy_parse(self, Tokens tokens) except -1
cdef int _beam_parse(self, Tokens tokens) except -1
cdef int _greedy_parse(self, Doc tokens) except -1
cdef int _beam_parse(self, Doc tokens) except -1

View File

@@ -31,7 +31,7 @@ from thinc.learner cimport LinearModel
from thinc.search cimport Beam
from thinc.search cimport MaxViolation
from ..tokens cimport Tokens, TokenC
from ..tokens cimport Doc, TokenC
from ..strings cimport StringStore
@@ -75,20 +75,20 @@ cdef class Parser:
templates = get_templates(self.cfg.features)
self.model = Model(self.moves.n_moves, templates, model_dir)
def __call__(self, Tokens tokens):
def __call__(self, Doc tokens):
if self.cfg.get('beam_width', 1) < 1:
self._greedy_parse(tokens)
else:
self._beam_parse(tokens)
def train(self, Tokens tokens, GoldParse gold):
def train(self, Doc tokens, GoldParse gold):
self.moves.preprocess_gold(gold)
if self.cfg.beam_width < 1:
return self._greedy_train(tokens, gold)
else:
return self._beam_train(tokens, gold)
cdef int _greedy_parse(self, Tokens tokens) except -1:
cdef int _greedy_parse(self, Doc tokens) except -1:
cdef atom_t[CONTEXT_SIZE] context
cdef int n_feats
cdef Pool mem = Pool()
@@ -106,7 +106,7 @@ cdef class Parser:
self.moves.finalize_state(stcls)
tokens.set_parse(stcls._sent)
cdef int _beam_parse(self, Tokens tokens) except -1:
cdef int _beam_parse(self, Doc tokens) except -1:
cdef Beam beam = Beam(self.moves.n_moves, self.cfg.beam_width)
words = [w.orth_ for w in tokens]
beam.initialize(_init_state, tokens.length, tokens.data)
@@ -118,7 +118,7 @@ cdef class Parser:
tokens.set_parse(state._sent)
_cleanup(beam)
def _greedy_train(self, Tokens tokens, GoldParse gold):
def _greedy_train(self, Doc tokens, GoldParse gold):
cdef Pool mem = Pool()
cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
self.moves.initialize_state(stcls)
@@ -143,7 +143,7 @@ cdef class Parser:
loss += cost
return loss
def _beam_train(self, Tokens tokens, GoldParse gold_parse):
def _beam_train(self, Doc tokens, GoldParse gold_parse):
cdef Beam pred = Beam(self.moves.n_moves, self.cfg.beam_width)
pred.initialize(_init_state, tokens.length, tokens.data)
pred.check_done(_check_final_state, NULL)
@@ -190,7 +190,7 @@ cdef class Parser:
beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
beam.check_done(_check_final_state, NULL)
def _count_feats(self, dict counts, Tokens tokens, list hist, int inc):
def _count_feats(self, dict counts, Doc tokens, list hist, int inc):
cdef atom_t[CONTEXT_SIZE] context
cdef Pool mem = Pool()
cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
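A hedged sketch of how the renamed Parser entry points are driven, assuming parser is a constructed Parser, doc a tagged Doc, and gold a GoldParse for the same sentence:

    parser(doc)                      # Parser.__call__: greedy or beam parse, depending on the configured beam width
    loss = parser.train(doc, gold)   # one training update; returns the loss from the greedy or beam trainer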

View File

@@ -8,7 +8,7 @@ from cymem.cymem cimport Pool
from .typedefs cimport hash_t
from .structs cimport LexemeC, TokenC, Morphology, UniStr
from .strings cimport StringStore
from .tokens cimport Tokens
from .tokens cimport Doc
from .vocab cimport Vocab, _Cached
@@ -27,13 +27,13 @@ cdef class Tokenizer:
cdef object _suffix_re
cdef object _infix_re
cpdef Tokens tokens_from_list(self, list strings)
cpdef Doc tokens_from_list(self, list strings)
cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
cdef int _try_cache(self, int idx, hash_t key, Doc tokens) except -1
cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1
cdef UniStr* _split_affixes(self, UniStr* string, vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes) except NULL
cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
cdef int _attach_tokens(self, Doc tokens, int idx, UniStr* string,
vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1

View File

@@ -16,7 +16,7 @@ from .morphology cimport set_morph_from_dict
from . import util
from .util import read_lang_data
from .tokens import Tokens
from .tokens import Doc
cdef class Tokenizer:
@@ -38,9 +38,9 @@ cdef class Tokenizer:
infix_re = re.compile(infix_re)
return cls(vocab, rules, prefix_re, suffix_re, infix_re)
cpdef Tokens tokens_from_list(self, list strings):
cpdef Doc tokens_from_list(self, list strings):
cdef int length = sum([len(s) for s in strings])
cdef Tokens tokens = Tokens(self.vocab, ' '.join(strings))
cdef Doc tokens = Doc(self.vocab, ' '.join(strings))
if length == 0:
return tokens
cdef UniStr string_struct
@@ -70,10 +70,10 @@ cdef class Tokenizer:
string (unicode): The string to be tokenized.
Returns:
tokens (Tokens): A Tokens object, giving access to a sequence of LexemeCs.
tokens (Doc): A Doc object, giving access to a sequence of LexemeCs.
"""
cdef int length = len(string)
cdef Tokens tokens = Tokens(self.vocab, string)
cdef Doc tokens = Doc(self.vocab, string)
if length == 0:
return tokens
cdef int i = 0
@@ -101,7 +101,7 @@ cdef class Tokenizer:
self._tokenize(tokens, &span, start, i)
return tokens
cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
cdef int _try_cache(self, int idx, hash_t key, Doc tokens) except -1:
cached = <_Cached*>self._cache.get(key)
if cached == NULL:
return False
@@ -114,7 +114,7 @@ cdef class Tokenizer:
idx = tokens.push_back(idx, &cached.data.tokens[i])
return True
cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1:
cdef vector[LexemeC*] prefixes
cdef vector[LexemeC*] suffixes
cdef hash_t orig_key
@@ -167,7 +167,7 @@ cdef class Tokenizer:
break
return string
cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
cdef int _attach_tokens(self, Doc tokens, int idx, UniStr* string,
vector[const LexemeC*] *prefixes,
vector[const LexemeC*] *suffixes) except -1:
cdef bint cache_hit
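Both Tokenizer entry points now build and return a Doc. A short sketch, with tokenizer standing in for a constructed Tokenizer:

    doc = tokenizer(u'Give it back! He pleaded.')                          # Tokenizer.__call__ -> Doc
    doc_pre_split = tokenizer.tokens_from_list([u'Give', u'it', u'back'])  # pre-split input, joined on whitespace -> Doc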

View File

@@ -28,13 +28,12 @@ cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
return lexeme.flags & (1 << flag_id)
cdef class Tokens:
cdef class Doc:
cdef Pool mem
cdef Vocab vocab
cdef TokenC* data
cdef list _py_tokens
cdef unicode _string
cdef tuple _tag_strings
@@ -62,12 +61,12 @@ cdef class Token:
cdef bint _owns_c_data
cdef Tokens _seq
cdef Doc _seq
@staticmethod
cdef inline Token cinit(Vocab vocab, unicode string,
const TokenC* token, int offset, int array_len,
Tokens parent_seq):
Doc parent_seq):
if offset < 0 or offset >= array_len:
msg = "Attempt to access token at %d, max length %d"

View File

@@ -78,7 +78,7 @@ cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
return 0
cdef class Tokens:
cdef class Doc:
"""
Container class for annotated text. Constructed via English.__call__ or
Tokenizer.__call__.
@@ -185,7 +185,7 @@ cdef class Tokens:
Yield a list of sentence Span objects, calculated from the dependency parse.
"""
cdef int i
cdef Tokens sent = Tokens(self.vocab, self._string[self.data[0].idx:])
cdef Doc sent = Doc(self.vocab, self._string[self.data[0].idx:])
start = 0
for i in range(1, self.length):
if self.data[i].sent_start:
@@ -370,7 +370,7 @@ cdef class Tokens:
cdef class Token:
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created
via Tokens.__getitem__ and Tokens.__iter__.
via Doc.__getitem__ and Doc.__iter__.
"""
def __cinit__(self, Vocab vocab, unicode string):
self.vocab = vocab
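
Putting the rename together from the caller's side, a hedged end-to-end sketch using the English pipeline of this era; attribute names such as orth_ appear elsewhere in this commit, while sents is assumed from the contemporary API:

    from spacy.en import English

    nlp = English()
    doc = nlp(u'Doc is the new name. Tokens is gone.')   # English.__call__ returns a Doc
    first_token = doc[0]                                 # a Token, via Doc.__getitem__
    words = [w.orth_ for w in doc]                       # iteration via Doc.__iter__
    sentences = list(doc.sents)                          # sentence Spans from the dependency parse (name assumed)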