Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 17:06:29 +03:00)
Commit 68f479e821 (parent 3ddea19b2b): Rename Doc.data to Doc.c
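The rename is mechanical: every access to the Doc's padded TokenC array moves from the Doc.data attribute to Doc.c, across the Matcher, PhraseMatcher, Parser, StepwiseState, Tagger, Tokenizer, Doc, Span, and Token classes touched below. A minimal sketch of the before/after pattern, as hypothetical downstream Cython code — count_tagged is illustrative and not part of this commit, and it assumes Doc is cimported (e.g. from spacy.tokens.doc cimport Doc):

    # Hypothetical helper illustrating the rename; not part of the diff.
    cdef int count_tagged(Doc doc):
        # After this commit the token array is doc.c; before it was doc.data.
        cdef int i, n = 0
        for i in range(doc.length):
            if doc.c[i].pos != 0:  # read a field of the i-th TokenC struct
                n += 1
        return n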
@@ -215,7 +215,7 @@ cdef class Matcher:
         cdef Pattern* state
         matches = []
         for token_i in range(doc.length):
-            token = &doc.data[token_i]
+            token = &doc.c[token_i]
             q = 0
             # Go over the open matches, extending or finalizing if able. Otherwise,
             # we over-write them (q doesn't advance)
@@ -286,7 +286,7 @@ cdef class PhraseMatcher:
         for i in range(self.max_length):
             self._phrase_key[i] = 0
         for i, tag in enumerate(tags):
-            lexeme = self.vocab[tokens.data[i].lex.orth]
+            lexeme = self.vocab[tokens.c[i].lex.orth]
             lexeme.set_flag(tag, True)
             self._phrase_key[i] = lexeme.orth
         cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
@@ -309,7 +309,7 @@ cdef class PhraseMatcher:
         for i in range(self.max_length):
             self._phrase_key[i] = 0
         for i, j in enumerate(range(start, end)):
-            self._phrase_key[i] = doc.data[j].lex.orth
+            self._phrase_key[i] = doc.c[j].lex.orth
         cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
         if self.phrase_ids.get(key):
             return True
@@ -84,7 +84,7 @@ cdef class Parser:
         return cls(strings, moves, model)
 
     def __call__(self, Doc tokens):
-        cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
+        cdef StateClass stcls = StateClass.init(tokens.c, tokens.length)
         self.moves.initialize_state(stcls)
 
         cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE,
@@ -112,7 +112,7 @@ cdef class Parser:
 
     def train(self, Doc tokens, GoldParse gold):
         self.moves.preprocess_gold(gold)
-        cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
+        cdef StateClass stcls = StateClass.init(tokens.c, tokens.length)
         self.moves.initialize_state(stcls)
         cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE,
                                   self.model.n_feats, self.model.n_feats)
@@ -143,7 +143,7 @@ cdef class StepwiseState:
     def __init__(self, Parser parser, Doc doc):
         self.parser = parser
         self.doc = doc
-        self.stcls = StateClass.init(doc.data, doc.length)
+        self.stcls = StateClass.init(doc.c, doc.length)
         self.parser.moves.initialize_state(self.stcls)
         self.eg = Example(self.parser.model.n_classes, CONTEXT_SIZE,
                           self.parser.model.n_feats, self.parser.model.n_feats)
@@ -141,9 +141,9 @@ cdef class Tagger:
         cdef int i
         cdef const weight_t* scores
         for i in range(tokens.length):
-            if tokens.data[i].pos == 0:
-                guess = self.predict(i, tokens.data)
-                self.vocab.morphology.assign_tag(&tokens.data[i], guess)
+            if tokens.c[i].pos == 0:
+                guess = self.predict(i, tokens.c)
+                self.vocab.morphology.assign_tag(&tokens.c[i], guess)
 
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
@@ -154,7 +154,7 @@ cdef class Tagger:
     def tag_from_strings(self, Doc tokens, object tag_strs):
         cdef int i
         for i in range(tokens.length):
-            self.vocab.morphology.assign_tag(&tokens.data[i], tag_strs[i])
+            self.vocab.morphology.assign_tag(&tokens.c[i], tag_strs[i])
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
 
@@ -170,13 +170,13 @@ cdef class Tagger:
             [g for g in gold_tag_strs if g is not None and g not in self.tag_names])
         correct = 0
         for i in range(tokens.length):
-            guess = self.update(i, tokens.data, golds[i])
+            guess = self.update(i, tokens.c, golds[i])
             loss = golds[i] != -1 and guess != golds[i]
 
-            self.vocab.morphology.assign_tag(&tokens.data[i], guess)
+            self.vocab.morphology.assign_tag(&tokens.c[i], guess)
 
             correct += loss == 0
-            self.freqs[TAG][tokens.data[i].tag] += 1
+            self.freqs[TAG][tokens.c[i].tag] += 1
         return correct
 
     cdef int predict(self, int i, const TokenC* tokens) except -1:
@@ -113,7 +113,7 @@ cdef class Tokenizer:
                         self._tokenize(tokens, span, key)
                 in_ws = not in_ws
                 if uc == ' ':
-                    tokens.data[tokens.length - 1].spacy = True
+                    tokens.c[tokens.length - 1].spacy = True
                     start = i + 1
                 else:
                     start = i
@@ -125,7 +125,7 @@ cdef class Tokenizer:
             cache_hit = self._try_cache(key, tokens)
             if not cache_hit:
                 self._tokenize(tokens, span, key)
-        tokens.data[tokens.length - 1].spacy = string[-1] == ' '
+        tokens.c[tokens.length - 1].spacy = string[-1] == ' '
         return tokens
 
     cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
@@ -148,7 +148,7 @@ cdef class Tokenizer:
         orig_size = tokens.length
         span = self._split_affixes(span, &prefixes, &suffixes)
         self._attach_tokens(tokens, span, &prefixes, &suffixes)
-        self._save_cached(&tokens.data[orig_size], orig_key, tokens.length - orig_size)
+        self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size)
 
     cdef unicode _split_affixes(self, unicode string, vector[const LexemeC*] *prefixes,
                                 vector[const LexemeC*] *suffixes):
@@ -26,7 +26,7 @@ cdef class Doc:
     cdef public object _vector
     cdef public object _vector_norm
 
-    cdef TokenC* data
+    cdef TokenC* c
 
     cdef public bint is_tagged
     cdef public bint is_parsed
@@ -73,7 +73,7 @@ cdef class Doc:
             data_start[i].lex = &EMPTY_LEXEME
             data_start[i].l_edge = i
             data_start[i].r_edge = i
-        self.data = data_start + PADDING
+        self.c = data_start + PADDING
         self.max_length = size
         self.length = 0
         self.is_tagged = False
@@ -97,7 +97,7 @@ cdef class Doc:
         if self._py_tokens[i] is not None:
             return self._py_tokens[i]
         else:
-            return Token.cinit(self.vocab, &self.data[i], i, self)
+            return Token.cinit(self.vocab, &self.c[i], i, self)
 
     def __iter__(self):
         """Iterate over the tokens.
@@ -110,7 +110,7 @@ cdef class Doc:
             if self._py_tokens[i] is not None:
                 yield self._py_tokens[i]
             else:
-                yield Token.cinit(self.vocab, &self.data[i], i, self)
+                yield Token.cinit(self.vocab, &self.c[i], i, self)
 
     def __len__(self):
         return self.length
@@ -187,7 +187,7 @@ cdef class Doc:
         cdef int label = 0
         output = []
         for i in range(self.length):
-            token = &self.data[i]
+            token = &self.c[i]
             if token.ent_iob == 1:
                 assert start != -1
             elif token.ent_iob == 2 or token.ent_iob == 0:
@@ -212,23 +212,23 @@ cdef class Doc:
         # 4. Test more nuanced date and currency regex
         cdef int i
         for i in range(self.length):
-            self.data[i].ent_type = 0
-            self.data[i].ent_iob = 0
+            self.c[i].ent_type = 0
+            self.c[i].ent_iob = 0
         cdef attr_t ent_type
         cdef int start, end
         for ent_type, start, end in ents:
             if ent_type is None or ent_type < 0:
                 # Mark as O
                 for i in range(start, end):
-                    self.data[i].ent_type = 0
-                    self.data[i].ent_iob = 2
+                    self.c[i].ent_type = 0
+                    self.c[i].ent_iob = 2
             else:
                 # Mark (inside) as I
                 for i in range(start, end):
-                    self.data[i].ent_type = ent_type
-                    self.data[i].ent_iob = 1
+                    self.c[i].ent_type = ent_type
+                    self.c[i].ent_iob = 1
                 # Set start as B
-                self.data[start].ent_iob = 3
+                self.c[start].ent_iob = 3
 
     @property
     def noun_chunks(self):
@@ -245,7 +245,7 @@ cdef class Doc:
         np_deps = [self.vocab.strings[label] for label in labels]
         np_label = self.vocab.strings['NP']
         for i in range(self.length):
-            word = &self.data[i]
+            word = &self.c[i]
             if word.pos == NOUN and word.dep in np_deps:
                 yield Span(self, word.l_edge, i+1, label=np_label)
 
@@ -263,7 +263,7 @@ cdef class Doc:
         cdef int i
         start = 0
         for i in range(1, self.length):
-            if self.data[i].sent_start:
+            if self.c[i].sent_start:
                 yield Span(self, start, i)
                 start = i
         yield Span(self, start, self.length)
@@ -271,7 +271,7 @@ cdef class Doc:
     cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
         if self.length == self.max_length:
             self._realloc(self.length * 2)
-        cdef TokenC* t = &self.data[self.length]
+        cdef TokenC* t = &self.c[self.length]
         if LexemeOrToken is const_TokenC_ptr:
             t[0] = lex_or_tok[0]
         else:
@@ -310,7 +310,7 @@ cdef class Doc:
         output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int32)
         for i in range(self.length):
             for j, feature in enumerate(attr_ids):
-                output[i, j] = get_token_attr(&self.data[i], feature)
+                output[i, j] = get_token_attr(&self.c[i], feature)
         return output
 
     def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None):
@@ -340,11 +340,11 @@ cdef class Doc:
         # Take this check out of the loop, for a bit of extra speed
         if exclude is None:
             for i in range(self.length):
-                counts.inc(get_token_attr(&self.data[i], attr_id), 1)
+                counts.inc(get_token_attr(&self.c[i], attr_id), 1)
         else:
             for i in range(self.length):
                 if not exclude(self[i]):
-                    attr = get_token_attr(&self.data[i], attr_id)
+                    attr = get_token_attr(&self.c[i], attr_id)
                     counts.inc(attr, 1)
         if output_dict:
             return dict(counts)
@@ -357,12 +357,12 @@ cdef class Doc:
         # words out-of-bounds, and get out-of-bounds markers.
         # Now that we want to realloc, we need the address of the true start,
         # so we jump the pointer back PADDING places.
-        cdef TokenC* data_start = self.data - PADDING
+        cdef TokenC* data_start = self.c - PADDING
         data_start = <TokenC*>self.mem.realloc(data_start, n * sizeof(TokenC))
-        self.data = data_start + PADDING
+        self.c = data_start + PADDING
         cdef int i
         for i in range(self.length, self.max_length + PADDING):
-            self.data[i].lex = &EMPTY_LEXEME
+            self.c[i].lex = &EMPTY_LEXEME
 
     cdef int set_parse(self, const TokenC* parsed) except -1:
         # TODO: This method is fairly misleading atm. It's used by Parser
@@ -371,14 +371,14 @@ cdef class Doc:
         # Probably we should use from_array?
         self.is_parsed = True
         for i in range(self.length):
-            self.data[i] = parsed[i]
-            assert self.data[i].l_edge <= i
-            assert self.data[i].r_edge >= i
+            self.c[i] = parsed[i]
+            assert self.c[i].l_edge <= i
+            assert self.c[i].r_edge >= i
 
     def from_array(self, attrs, array):
         cdef int i, col
         cdef attr_id_t attr_id
-        cdef TokenC* tokens = self.data
+        cdef TokenC* tokens = self.c
         cdef int length = len(array)
         cdef attr_t[:] values
         for col, attr_id in enumerate(attrs):
@@ -412,7 +412,7 @@ cdef class Doc:
                 tokens[i].ent_type = values[i]
             else:
                 raise ValueError("Unknown attribute ID: %d" % attr_id)
-        set_children_from_heads(self.data, self.length)
+        set_children_from_heads(self.c, self.length)
         return self
 
     def to_bytes(self):
@@ -447,9 +447,9 @@ cdef class Doc:
         cdef int start = -1
         cdef int end = -1
         for i in range(self.length):
-            if self.data[i].idx == start_idx:
+            if self.c[i].idx == start_idx:
                 start = i
-            if (self.data[i].idx + self.data[i].lex.length) == end_idx:
+            if (self.c[i].idx + self.c[i].lex.length) == end_idx:
                 if start == -1:
                     return None
                 end = i + 1
@@ -464,10 +464,10 @@ cdef class Doc:
             new_orth = new_orth[:-len(span[-1].whitespace_)]
         cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth)
         # House the new merged token where it starts
-        cdef TokenC* token = &self.data[start]
+        cdef TokenC* token = &self.c[start]
         # Update fields
         token.lex = lex
-        token.spacy = self.data[end-1].spacy
+        token.spacy = self.c[end-1].spacy
         if tag in self.vocab.morphology.tag_map:
             self.vocab.morphology.assign_tag(token, tag)
         else:
@ -486,31 +486,31 @@ cdef class Doc:
|
||||||
span_root = span.root.i
|
span_root = span.root.i
|
||||||
token.dep = span.root.dep
|
token.dep = span.root.dep
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
self.data[i].head += i
|
self.c[i].head += i
|
||||||
# Set the head of the merged token, and its dep relation, from the Span
|
# Set the head of the merged token, and its dep relation, from the Span
|
||||||
token.head = self.data[span_root].head
|
token.head = self.c[span_root].head
|
||||||
# Adjust deps before shrinking tokens
|
# Adjust deps before shrinking tokens
|
||||||
# Tokens which point into the merged token should now point to it
|
# Tokens which point into the merged token should now point to it
|
||||||
# Subtract the offset from all tokens which point to >= end
|
# Subtract the offset from all tokens which point to >= end
|
||||||
offset = (end - start) - 1
|
offset = (end - start) - 1
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
head_idx = self.data[i].head
|
head_idx = self.c[i].head
|
||||||
if start <= head_idx < end:
|
if start <= head_idx < end:
|
||||||
self.data[i].head = start
|
self.c[i].head = start
|
||||||
elif head_idx >= end:
|
elif head_idx >= end:
|
||||||
self.data[i].head -= offset
|
self.c[i].head -= offset
|
||||||
# Now compress the token array
|
# Now compress the token array
|
||||||
for i in range(end, self.length):
|
for i in range(end, self.length):
|
||||||
self.data[i - offset] = self.data[i]
|
self.c[i - offset] = self.c[i]
|
||||||
for i in range(self.length - offset, self.length):
|
for i in range(self.length - offset, self.length):
|
||||||
memset(&self.data[i], 0, sizeof(TokenC))
|
memset(&self.c[i], 0, sizeof(TokenC))
|
||||||
self.data[i].lex = &EMPTY_LEXEME
|
self.c[i].lex = &EMPTY_LEXEME
|
||||||
self.length -= offset
|
self.length -= offset
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
# ...And, set heads back to a relative position
|
# ...And, set heads back to a relative position
|
||||||
self.data[i].head -= i
|
self.c[i].head -= i
|
||||||
# Set the left/right children, left/right edges
|
# Set the left/right children, left/right edges
|
||||||
set_children_from_heads(self.data, self.length)
|
set_children_from_heads(self.c, self.length)
|
||||||
# Clear the cached Python objects
|
# Clear the cached Python objects
|
||||||
self._py_tokens = [None] * self.length
|
self._py_tokens = [None] * self.length
|
||||||
# Return the merged Python object
|
# Return the merged Python object
|
||||||
|
|
|
@@ -139,12 +139,12 @@ cdef class Span:
         def __get__(self):
             # This should probably be called 'head', and the other one called
             # 'gov'. But we went with 'head' elsehwhere, and now we're stuck =/
-            cdef const TokenC* start = &self.doc.data[self.start]
-            cdef const TokenC* end = &self.doc.data[self.end]
+            cdef const TokenC* start = &self.doc.c[self.start]
+            cdef const TokenC* end = &self.doc.c[self.end]
             head = start
             while start <= (head + head.head) < end and head.head != 0:
                 head += head.head
-            return self.doc[head - self.doc.data]
+            return self.doc[head - self.doc.c]
 
     property lefts:
         """Tokens that are to the left of the Span, whose head is within the Span."""
@@ -31,7 +31,7 @@ cdef class Token:
     def __cinit__(self, Vocab vocab, Doc doc, int offset):
        self.vocab = vocab
        self.doc = doc
-       self.c = &self.doc.data[offset]
+       self.c = &self.doc.c[offset]
        self.i = offset
        self.array_len = doc.length
 