Matthew Honnibal 2014-12-24 17:42:00 +11:00
parent 75a6930ad9
commit b8b65903fc
10 changed files with 161 additions and 222 deletions

View File

@@ -23,6 +23,7 @@ class English(object):
         self.tokenizer = Tokenizer.from_dir(self.vocab, data_dir)
         self.tagger = EnPosTagger(self.vocab.strings, data_dir) if tag else None
         self.parser = GreedyParser(path.join(data_dir, 'deps')) if parse else None
+        self.strings = self.vocab.strings

     def __call__(self, text, tag=True, parse=True):
         tokens = self.tokenizer.tokenize(text)
@@ -31,3 +32,10 @@ class English(object):
         if self.parser and parse:
             self.parser.parse(tokens)
         return tokens
+
+    @property
+    def tags(self):
+        if self.tagger is None:
+            return []
+        else:
+            return self.tagger.tag_names
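
A minimal usage sketch of the pipeline above; the data_dir value and the constructor keywords are assumptions read off this hunk rather than a documented API:

    # Hypothetical usage; data_dir is illustrative, not a shipped path.
    from spacy.en import English

    nlp = English(data_dir='data/en', tag=True, parse=False)
    tokens = nlp(u'Pizza was eaten.', tag=True, parse=False)
    print(nlp.tags)     # the tagger's tag names, or [] with no tagger loaded
    print(nlp.strings)  # alias for nlp.vocab.strings, per the line added above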

View File

@@ -4,10 +4,12 @@ from ..typedefs cimport ID as _ID
 from ..typedefs cimport SIC as _SIC
 from ..typedefs cimport SHAPE as _SHAPE
 from ..typedefs cimport DENSE as _DENSE
+from ..typedefs cimport CLUSTER as _CLUSTER
 from ..typedefs cimport SHAPE as _SHAPE
 from ..typedefs cimport PREFIX as _PREFIX
 from ..typedefs cimport SUFFIX as _SUFFIX
 from ..typedefs cimport LEMMA as _LEMMA
+from ..typedefs cimport POS as _POS

 # Work around the lack of global cpdef variables
@@ -25,8 +27,10 @@ cpdef enum:
     ID = _ID
     SIC = _SIC
-    SHAPE = _DENSE
-    DENSE = _SHAPE
+    SHAPE = _SHAPE
+    DENSE = _DENSE
     PREFIX = _PREFIX
     SUFFIX = _SUFFIX
+    CLUSTER = _CLUSTER
     LEMMA = _LEMMA
+    POS = _POS
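
The "# Work around the lack of global cpdef variables" comment names a real Cython limitation: a module-level assignment is visible either to C or to Python, not both. The attribute ids are therefore cimported under private aliases and re-exported through a cpdef enum, whose members compile to C constants and are also exposed as Python integers. A minimal sketch of the pattern, with illustrative module and symbol names:

    # attrs_sketch.pyx -- illustrative sketch, not the actual spaCy module.
    from .typedefs cimport ID as _ID   # cimported names exist only at C level

    cpdef enum:        # each enum member is a C constant *and* is exposed to
        ID = _ID       # Python, so `from attrs_sketch import ID` works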

View File

@@ -1,3 +1,4 @@
+# cython: profile=True
 from os import path

 import json
@@ -228,7 +229,7 @@ cdef class EnPosTagger(Tagger):
         cdef TokenC* t = tokens.data
         for i in range(tokens.length):
             fill_context(context, i, t)
-            t[i].pos = self.predict(context)
+            t[i].fine_pos = self.predict(context)
             self.set_morph(i, t)

     def train(self, Tokens tokens, golds):
@@ -238,13 +239,14 @@ cdef class EnPosTagger(Tagger):
         cdef TokenC* t = tokens.data
         for i in range(tokens.length):
             fill_context(context, i, t)
-            t[i].pos = self.predict(context, [golds[i]])
+            t[i].fine_pos = self.predict(context, [golds[i]])
             self.set_morph(i, t)
-            c += t[i].pos == golds[i]
+            c += t[i].fine_pos == golds[i]
         return c

     cdef int set_morph(self, const int i, TokenC* tokens) except -1:
-        cdef const PosTag* tag = &self.tags[tokens[i].pos]
+        cdef const PosTag* tag = &self.tags[tokens[i].fine_pos]
+        tokens[i].pos = tag.pos
         cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.sic)
         if cached is NULL:
             cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
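
The tagger now writes the fine-grained tag id to fine_pos and, in set_morph, derives the coarse universal tag from the PosTag table (tokens[i].pos = tag.pos). The shape of that coarse/fine split, sketched in plain Python with illustrative tag names rather than spaCy's actual tables:

    # Illustrative coarse/fine POS split; the mapping values are examples.
    PTB_TO_UNIV = {'NN': 'NOUN', 'NNS': 'NOUN', 'VBD': 'VERB', 'JJ': 'ADJ'}

    def set_pos(token, fine_tag):
        token['fine_pos'] = fine_tag                   # e.g. 'NNS'
        token['pos'] = PTB_TO_UNIV.get(fine_tag, 'X')  # coarse universal tag

    tok = {}
    set_pos(tok, 'NNS')
    assert tok == {'fine_pos': 'NNS', 'pos': 'NOUN'}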

View File

@@ -40,8 +40,9 @@ cdef struct PosTag:

 cdef struct TokenC:
     const Lexeme* lex
     Morphology morph
+    univ_tag_t pos
+    int fine_pos
     int idx
-    int pos
     int lemma
     int sense
     int head

View File

@@ -13,8 +13,7 @@ from itertools import combinations

 from ..tokens cimport TokenC
 from ._state cimport State
 from ._state cimport get_s2, get_s1, get_s0, get_n0, get_n1, get_n2
-from ._state cimport has_head, get_left, get_right
-from ._state cimport count_left_kids, count_right_kids
+from ._state cimport get_left, get_right


 cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
@@ -25,12 +24,10 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
         context[3] = 0
         context[4] = 0
         context[5] = 0
-        context[6] = 0
     else:
         context[0] = token.lex.sic
-        context[1] = token.lemma
-        context[2] = token.pos
-        context[3] = token.lex.cluster
+        context[1] = token.pos
+        context[2] = token.lex.cluster
         # We've read in the string little-endian, so now we can take & (2**n)-1
         # to get the first n bits of the cluster.
         # e.g. s = "1110010101"
@@ -43,9 +40,9 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
         # What we're doing here is picking a number where all bits are 1, e.g.
         # 15 is 1111, 63 is 111111 and doing bitwise AND, so getting all bits in
         # the source that are set to 1.
-        context[4] = token.lex.cluster & 63
-        context[5] = token.lex.cluster & 15
-        context[6] = token.dep_tag if has_head(token) else 0
+        context[3] = token.lex.cluster & 63
+        context[4] = token.lex.cluster & 15
+        context[5] = token.dep_tag


 cdef int fill_context(atom_t* context, State* state) except -1:
@@ -69,148 +66,12 @@ cdef int fill_context(atom_t* context, State* state) except -1:
         context[dist] = state.stack[0] - state.i
     else:
         context[dist] = 0
-    context[N0lv] = max(count_left_kids(get_n0(state)), 5)
-    context[S0lv] = max(count_left_kids(get_s0(state)), 5)
-    context[S0rv] = max(count_right_kids(get_s0(state)), 5)
-    context[S1lv] = max(count_left_kids(get_s1(state)), 5)
-    context[S1rv] = max(count_right_kids(get_s1(state)), 5)
+    context[N0lv] = 0
+    context[S0lv] = 0
+    context[S0rv] = 0
+    context[S1lv] = 0
+    context[S1rv] = 0
-
-    context[S0_has_head] = 0
-    context[S1_has_head] = 0
-    context[S2_has_head] = 0
-    if state.stack_len >= 1:
-        context[S0_has_head] = has_head(get_s0(state)) + 1
-    if state.stack_len >= 2:
-        context[S1_has_head] = has_head(get_s1(state)) + 1
-    if state.stack_len >= 3:
-        context[S2_has_head] = has_head(get_s2(state))
-
-
-unigrams = (
-    (S2W, S2p),
-    (S2c6, S2p),
-    (S1W, S1p),
-    (S1c6, S1p),
-    (S0W, S0p),
-    (S0c6, S0p),
-    (N0W, N0p),
-    (N0p,),
-    (N0c,),
-    (N0c6, N0p),
-    (N0L,),
-    (N1W, N1p),
-    (N1c6, N1p),
-    (N2W, N2p),
-    (N2c6, N2p),
-    (S0r2W, S0r2p),
-    (S0r2c6, S0r2p),
-    (S0r2L,),
-    (S0rW, S0rp),
-    (S0rc6, S0rp),
-    (S0rL,),
-    (S0l2W, S0l2p),
-    (S0l2c6, S0l2p),
-    (S0l2L,),
-    (S0lW, S0lp),
-    (S0lc6, S0lp),
-    (S0lL,),
-    (N0l2W, N0l2p),
-    (N0l2c6, N0l2p),
-    (N0l2L,),
-    (N0lW, N0lp),
-    (N0lc6, N0lp),
-    (N0lL,),
-)
-
-s0_n0 = (
-    (S0W, S0p, N0W, N0p),
-    (S0c, S0p, N0c, N0p),
-    (S0c6, S0p, N0c6, N0p),
-    (S0c4, S0p, N0c4, N0p),
-    (S0p, N0p),
-    (S0W, N0p),
-    (S0p, N0W),
-    (S0W, N0c),
-    (S0c, N0W),
-    (S0p, N0c),
-    (S0c, N0p),
-    (S0W, S0rp, N0p),
-    (S0p, S0rp, N0p),
-    (S0p, N0lp, N0W),
-    (S0p, N0lp, N0p),
-)
-
-s1_n0 = (
-    (S1p, N0p),
-    (S1c, N0c),
-    (S1c, N0p),
-    (S1p, N0c),
-    (S1W, S1p, N0p),
-    (S1p, N0W, N0p),
-    (S1c6, S1p, N0c6, N0p),
-)
-
-s0_n1 = (
-    (S0p, N1p),
-    (S0c, N1c),
-    (S0c, N1p),
-    (S0p, N1c),
-    (S0W, S0p, N1p),
-    (S0p, N1W, N1p),
-    (S0c6, S0p, N1c6, N1p),
-)
-
-n0_n1 = (
-    (N0W, N0p, N1W, N1p),
-    (N0W, N0p, N1p),
-    (N0p, N1W, N1p),
-    (N0c, N0p, N1c, N1p),
-    (N0c6, N0p, N1c6, N1p),
-    (N0c, N1c),
-    (N0p, N1c),
-)
-
-tree_shape = (
-    (dist,),
-    (S0p, S0_has_head, S1_has_head, S2_has_head),
-    (S0p, S0lv, S0rv),
-    (N0p, N0lv),
-)
-
-trigrams = (
-    (N0p, N1p, N2p),
-    (S0p, S0lp, S0l2p),
-    (S0p, S0rp, S0r2p),
-    (S0p, S1p, S2p),
-    (S1p, S0p, N0p),
-    (S0p, S0lp, N0p),
-    (S0p, N0p, N0lp),
-    (N0p, N0lp, N0l2p),
-    (S0W, S0p, S0rL, S0r2L),
-    (S0p, S0rL, S0r2L),
-    (S0W, S0p, S0lL, S0l2L),
-    (S0p, S0lL, S0l2L),
-    (N0W, N0p, N0lL, N0l2L),
-    (N0p, N0lL, N0l2L),
-)

 arc_eager = (
     (S0w, S0p),
@@ -225,6 +86,7 @@ arc_eager = (
     (N2w, N2p),
     (N2w,),
     (N2p,),
+
     (S0w, S0p, N0w, N0p),
     (S0w, S0p, N0w),
     (S0w, N0w, N0p),
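
The little-endian comment in fill_token deserves a worked example: the Brown cluster path is stored with its first character as the lowest bit, so masking with (2**n)-1 keeps the first n characters of the path. A quick plain-Python check of the arithmetic:

    # Worked example of the cluster-prefix masking described in fill_token.
    s = "1110010101"             # a Brown cluster path, as a bit string
    cluster = int(s[::-1], 2)    # little-endian: first char = lowest bit
    prefix6 = cluster & 63       # (2**6)-1 keeps the first 6 path characters
    assert format(prefix6, 'b')[::-1].ljust(6, '0') == s[:6]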

View File

@@ -6,7 +6,7 @@ cimport numpy as np
 from cymem.cymem cimport Pool
 from thinc.typedefs cimport atom_t

-from .typedefs cimport flags_t
+from .typedefs cimport flags_t, attr_id_t, attr_t
 from .structs cimport Morphology, TokenC, Lexeme
 from .vocab cimport Vocab
 from .strings cimport StringStore
@@ -20,6 +20,13 @@ ctypedef fused LexemeOrToken:
     TokenC_ptr

+
+cdef attr_t get_lex_attr(const Lexeme* lex, attr_id_t feat_name) nogil
+cdef attr_t get_token_attr(const TokenC* lex, attr_id_t feat_name) nogil
+
+cdef inline bint check_flag(const Lexeme* lexeme, attr_id_t flag_id) nogil:
+    return lexeme.flags & (1 << flag_id)
+

 cdef class Tokens:
     cdef Pool mem
     cdef Vocab vocab
@@ -36,28 +43,5 @@ cdef class Tokens:

 cdef class Token:
-    cdef readonly StringStore string_store
-    cdef public int i
-    cdef public int idx
-    cdef readonly int pos_id
-    cdef readonly int dep_id
-    cdef int lemma
-    cdef public int head
-    cdef public int dep_tag
-    cdef public atom_t id
-    cdef public atom_t cluster
-    cdef public atom_t length
-    cdef public atom_t postype
-    cdef public atom_t sensetype
-    cdef public atom_t sic
-    cdef public atom_t norm
-    cdef public atom_t shape
-    cdef public atom_t asciied
-    cdef public atom_t prefix
-    cdef public atom_t suffix
-    cdef public float prob
-    cdef public flags_t flags
+    cdef Tokens _seq
+    cdef readonly int i
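
Token goes from an eager copy of roughly twenty lexeme fields to a lightweight view: it keeps only a reference to its Tokens sequence plus an index, and reads everything else from the backing TokenC array on demand. The pattern in plain Python, with illustrative names rather than spaCy's actual classes:

    # Illustrative sketch of the view pattern; not the actual spaCy classes.
    class Sequence:
        def __init__(self, data):
            self.data = data      # backing array of token records

    class TokenView:
        def __init__(self, seq, i):
            self._seq = seq       # no copying: a reference plus an index
            self.i = i

        @property
        def cluster(self):
            return self._seq.data[self.i]['cluster']  # fetched on demand

    seq = Sequence([{'cluster': 39}])
    assert TokenView(seq, 0).cluster == 39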

View File

@@ -2,16 +2,17 @@
 from preshed.maps cimport PreshMap
 from preshed.counter cimport PreshCounter

-from .lexeme cimport get_attr, EMPTY_LEXEME
+from .vocab cimport EMPTY_LEXEME
 from .typedefs cimport attr_id_t, attr_t
 from .typedefs cimport LEMMA
+from .typedefs cimport ID, SIC, DENSE, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, POS_TYPE
+from .typedefs cimport POS, LEMMA

 cimport cython
 import numpy as np
 cimport numpy as np

-POS = 0
-ENTITY = 0

 DEF PADDING = 5
@@ -23,6 +24,40 @@ cdef int bounds_check(int i, int length, int padding) except -1:
         raise IndexError

+
+cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
+    if feat_name == LEMMA:
+        return token.lemma
+    elif feat_name == POS:
+        return token.pos
+    else:
+        return get_lex_attr(token.lex, feat_name)
+
+
+cdef attr_t get_lex_attr(const Lexeme* lex, attr_id_t feat_name) nogil:
+    if feat_name < (sizeof(flags_t) * 8):
+        return check_flag(lex, feat_name)
+    elif feat_name == ID:
+        return lex.id
+    elif feat_name == SIC:
+        return lex.sic
+    elif feat_name == DENSE:
+        return lex.dense
+    elif feat_name == SHAPE:
+        return lex.shape
+    elif feat_name == PREFIX:
+        return lex.prefix
+    elif feat_name == SUFFIX:
+        return lex.suffix
+    elif feat_name == LENGTH:
+        return lex.length
+    elif feat_name == CLUSTER:
+        return lex.cluster
+    elif feat_name == POS_TYPE:
+        return lex.pos_type
+    else:
+        return 0
+

 cdef class Tokens:
     """A sequence of references to Lexeme objects.
@@ -52,9 +87,7 @@ cdef class Tokens:
     def __getitem__(self, i):
         bounds_check(i, self.length, PADDING)
-        return Token(self.vocab.strings, i, self.data[i].idx, self.data[i].pos,
-                     self.data[i].lemma, self.data[i].head, self.data[i].dep_tag,
-                     self.data[i].lex[0])
+        return Token(self, i)

     def __iter__(self):
         for i in range(self.length):
@@ -71,6 +104,7 @@ cdef class Tokens:
             t[0] = lex_or_tok[0]
         else:
             t.lex = lex_or_tok
+        t.idx = idx
         self.length += 1
         return idx + t.lex.length
@@ -82,7 +116,7 @@ cdef class Tokens:
         output = np.ndarray(shape=(self.length, len(attr_ids)), dtype=int)
         for i in range(self.length):
             for j, feature in enumerate(attr_ids):
-                output[i, j] = get_attr(self.data[i].lex, feature)
+                output[i, j] = get_token_attr(&self.data[i], feature)
         return output

     def count_by(self, attr_id_t attr_id):
@@ -92,10 +126,7 @@ cdef class Tokens:
         cdef PreshCounter counts = PreshCounter(2 ** 8)
         for i in range(self.length):
-            if attr_id == LEMMA:
-                attr = self.data[i].lemma
-            else:
-                attr = get_attr(self.data[i].lex, attr_id)
+            attr = get_token_attr(&self.data[i], attr_id)
             counts.inc(attr, 1)
         return dict(counts)
@@ -117,49 +148,68 @@ cdef class Tokens:
 @cython.freelist(64)
 cdef class Token:
-    def __init__(self, StringStore string_store, int i, int idx,
-                 int pos, int lemma, int head, int dep_tag, dict lex):
-        self.string_store = string_store
-        self.idx = idx
-        self.pos_id = pos
+    def __init__(self, Tokens tokens, int i):
+        self._seq = tokens
         self.i = i
-        self.head = head
-        self.dep_id = dep_tag
-        self.id = lex['id']
-        self.lemma = lemma
-        self.cluster = lex['cluster']
-        self.length = lex['length']
-        self.postype = lex['pos_type']
-        self.sensetype = 0
-        self.sic = lex['sic']
-        self.norm = lex['dense']
-        self.shape = lex['shape']
-        self.suffix = lex['suffix']
-        self.prefix = lex['prefix']
-        self.prob = lex['prob']
-        self.flags = lex['flags']
+
+    def __unicode__(self):
+        cdef const TokenC* t = &self._seq.data[self.i]
+        cdef int end_idx = t.idx + t.lex.length
+        if self.i + 1 == self._seq.length:
+            return self.string
+        if end_idx == t[1].idx:
+            return self.string
+        else:
+            return self.string + ' '
+
+    def __len__(self):
+        return self._seq.data[self.i].lex.length
+
+    property idx:
+        def __get__(self):
+            return self._seq.data[self.i].idx
+
+    property length:
+        def __get__(self):
+            return self._seq.data[self.i].lex.length
+
+    property cluster:
+        def __get__(self):
+            return self._seq.data[self.i].lex.cluster

     property string:
         def __get__(self):
-            if self.sic == 0:
+            cdef const TokenC* t = &self._seq.data[self.i]
+            if t.lex.sic == 0:
                 return ''
-            cdef bytes utf8string = self.string_store[self.sic]
+            cdef bytes utf8string = self._seq.vocab.strings[t.lex.sic]
             return utf8string.decode('utf8')

     property lemma:
         def __get__(self):
-            if self.lemma == 0:
+            cdef const TokenC* t = &self._seq.data[self.i]
+            if t.lemma == 0:
                 return self.string
-            cdef bytes utf8string = self.string_store[self.lemma]
+            cdef bytes utf8string = self._seq.vocab.strings[t.lemma]
             return utf8string.decode('utf8')

     property dep:
         def __get__(self):
-            return self.string_store.dep_tags[self.dep_id]
+            return self._seq.data[self.i].dep_tag

     property pos:
         def __get__(self):
-            return self.pos_id
+            return self._seq.data[self.i].pos
+
+    property fine_pos:
+        def __get__(self):
+            return self._seq.data[self.i].fine_pos
+
+    property sic:
+        def __get__(self):
+            return self._seq.data[self.i].lex.sic
+
+    property head:
+        def __get__(self):
+            cdef const TokenC* t = &self._seq.data[self.i]
+            return Token(self._seq, self.i + t.head)
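
With get_token_attr in place, to_array and count_by handle token-level and lexeme-level attributes through one dispatch point: LEMMA and POS are answered from the TokenC struct, everything else falls through to the lexeme. The shape of that logic in plain Python, with illustrative names:

    # Plain-Python sketch of the get_token_attr dispatch; names illustrative.
    LEMMA, POS = 'lemma', 'pos'

    def get_token_attr(token, feat_name):
        if feat_name in (LEMMA, POS):
            return token[feat_name]     # stored per token
        return token['lex'][feat_name]  # everything else lives on the lexeme

    tok = {'lemma': 10, 'pos': 3, 'lex': {'sic': 7, 'cluster': 39}}
    assert get_token_attr(tok, 'cluster') == 39
    assert get_token_attr(tok, POS) == 3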

View File

@@ -99,6 +99,7 @@ cpdef enum attr_id_t:
     CLUSTER
     POS_TYPE
     LEMMA
+    POS

View File

@@ -9,6 +9,9 @@ from .typedefs cimport utf8_t, id_t, hash_t
 from .strings cimport StringStore

+cdef Lexeme EMPTY_LEXEME
+
+
 cdef union LexemesOrTokens:
     const Lexeme* const* lexemes
     TokenC* tokens

View File

@@ -1,10 +1,34 @@
 from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
+from libc.string cimport memset

 from os import path

 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport init as lexeme_init
 from .strings cimport slice_unicode
+from . import orth
+
+
+memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
+
+
+cpdef Lexeme init_lexeme(id_t i, unicode string, hash_t hashed,
+                         StringStore string_store, dict props) except *:
+    cdef Lexeme lex
+    lex.id = i
+    lex.length = len(string)
+    lex.sic = string_store[string]
+
+    lex.cluster = props.get('cluster', 0)
+    lex.pos_type = props.get('pos_type', 0)
+    lex.prob = props.get('prob', 0)
+
+    lex.prefix = string_store[string[:1]]
+    lex.suffix = string_store[string[-3:]]
+    lex.shape = string_store[orth.word_shape(string)]
+
+    lex.flags = props.get('flags', 0)
+    return lex


 cdef class Vocab:
@@ -43,7 +67,7 @@ cdef class Vocab:
         mem = self.mem
         cdef unicode py_string = string.chars[:string.n]
         lex = <Lexeme*>mem.alloc(sizeof(Lexeme), 1)
-        lex[0] = lexeme_init(self.lexemes.size(), py_string, string.key, self.strings,
+        lex[0] = init_lexeme(self.lexemes.size(), py_string, string.key, self.strings,
                              self.get_lex_props(py_string))
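
init_lexeme computes the string-derived features directly: the prefix is the first character, the suffix the last three characters, and the shape comes from orth.word_shape, with everything interned in the StringStore. What that yields for one word, assuming word_shape follows the usual X/x/d convention:

    # What init_lexeme derives from 'Dogs' (word_shape output is assumed).
    string = 'Dogs'
    prefix = string[:1]    # 'D'
    suffix = string[-3:]   # 'ogs'
    shape = 'Xxxx'         # assumed result of orth.word_shape('Dogs')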