* Tmp. Working on refactor. Compiles; lexical features still need to be hooked up.

Matthew Honnibal 2015-01-14 00:03:48 +11:00
parent 46da3d74d2
commit 0930892fc1
9 changed files with 150 additions and 196 deletions

View File

@@ -12,7 +12,10 @@ from .attrs import get_flags
def get_lex_props(string):
return {'flags': get_flags(string), 'dense': 1}
return {'flags': get_flags(string), 'length': len(string),
'sic': string, 'norm1': string, 'norm2': string, 'shape': string,
'prefix': string[0], 'suffix': string[-3:], 'cluster': 0, 'prob': 0,
'sentiment': 0}
LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
@@ -45,7 +48,7 @@ class English(object):
"""
def __init__(self, data_dir=LOCAL_DATA_DIR):
self._data_dir = data_dir
self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab'),
self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
get_lex_props=get_lex_props)
tag_names = list(POS_TAGS.keys())
tag_names.sort()
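
For reference, a sketch of the dict the new get_lex_props contract produces, with illustrative values for a hypothetical input ('shape' and the norms are still placeholder copies of the string at this stage):

    # Illustrative only: get_lex_props(u'Hello') under the code above, where
    # 'prefix' is string[0] and 'suffix' is string[-3:].
    {'flags': get_flags(u'Hello'), 'length': 5,
     'sic': u'Hello', 'norm1': u'Hello', 'norm2': u'Hello', 'shape': u'Hello',
     'prefix': u'H', 'suffix': u'llo', 'cluster': 0, 'prob': 0, 'sentiment': 0}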

View File

@@ -283,12 +283,12 @@ cdef class EnPosTagger:
cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1:
if self.lemmatizer is None:
return lex.sic
cdef bytes py_string = self.strings[lex.sic]
cdef unicode py_string = self.strings[lex.sic]
if pos != NOUN and pos != VERB and pos != ADJ:
return lex.sic
cdef set lemma_strings
cdef unicode lemma_string
lemma_strings = self.lemmatizer(py_string.decode('utf8'), pos)
lemma_strings = self.lemmatizer(py_string, pos)
lemma_string = sorted(lemma_strings)[0]
lemma = self.strings.intern(lemma_string.encode('utf8'), len(lemma_string)).i
return lemma
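
The tagger now receives unicode from the string store and passes it straight to the lemmatizer, dropping the decode step. A plain-Python sketch of the selection logic, with pick_lemma as a hypothetical helper:

    # Sketch: the lemmatizer returns a set of candidate lemma strings; taking
    # the alphabetically first candidate keeps the choice deterministic.
    def pick_lemma(lemmatizer, py_string, pos):
        candidates = lemmatizer(py_string, pos)  # e.g. {u'good'} for (u'better', ADJ)
        return sorted(candidates)[0]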

View File

@@ -7,9 +7,7 @@ from .strings cimport StringStore
cdef LexemeC EMPTY_LEXEME
cdef LexemeC init(id_t i, unicode string, hash_t hashed, StringStore store,
dict props) except *
cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings) except -1
cdef class Lexeme:
cdef const float* vec

View File

@@ -5,27 +5,27 @@ from murmurhash.mrmr cimport hash64
from libc.string cimport memset
from .orth cimport word_shape
from .typedefs cimport attr_t
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
cdef LexemeC init(id_t i, unicode string, hash_t hashed,
StringStore string_store, dict props) except *:
cdef LexemeC lex
lex.id = i
lex.length = len(string)
lex.sic = string_store[string]
cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store) except -1:
lex.cluster = props.get('cluster', 0)
lex.prob = props.get('prob', 0)
lex.length = props['length']
lex.sic = string_store[props['sic']]
lex.norm1 = string_store[props['norm1']]
lex.norm2 = string_store[props['norm2']]
lex.shape = string_store[props['shape']]
lex.prefix = string_store[props['prefix']]
lex.suffix = string_store[props['suffix']]
lex.prefix = string_store[string[:1]]
lex.suffix = string_store[string[-3:]]
lex.shape = string_store[word_shape(string)]
lex.cluster = props['cluster']
lex.prob = props['prob']
lex.sentiment = props['sentiment']
lex.flags = props.get('flags', 0)
return lex
lex.flags = props['flags']
cdef class Lexeme:

View File

@@ -67,7 +67,7 @@ cdef class StringStore:
if string_or_id < 1 or string_or_id >= self.size:
raise IndexError(string_or_id)
utf8str = &self.strings[<int>string_or_id]
return utf8str.chars[:utf8str.length]
return utf8str.chars[:utf8str.length].decode('utf8')
elif isinstance(string_or_id, bytes):
utf8str = self.intern(<char*>string_or_id, len(string_or_id))
return utf8str.i
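
In other words, the store now decodes on ID lookup, while byte strings are still interned to IDs. A hypothetical round-trip, assuming an empty store:

    # Usage sketch of the round-trip implied by the code above.
    store = StringStore()
    i = store[b'hello']           # bytes are interned; the integer ID comes back
    assert store[i] == u'hello'   # an ID now yields decoded unicode, not bytes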

View File

@@ -42,32 +42,5 @@ cdef class Tokens:
cdef class Token:
cdef cvarray vec
cdef readonly flags_t flags
cdef readonly attr_t id
cdef readonly attr_t sic
cdef readonly attr_t dense
cdef readonly attr_t shape
cdef readonly attr_t prefix
cdef readonly attr_t suffix
cdef readonly attr_t length
cdef readonly attr_t cluster
cdef readonly attr_t pos_type
cdef readonly float prob
cdef readonly float sentiment
cdef readonly Morphology morph
cdef readonly univ_tag_t pos
cdef readonly int fine_pos
cdef readonly int idx
cdef readonly int lemma
cdef readonly int sense
cdef readonly int dep_tag
cdef readonly int head_offset
cdef readonly uint32_t l_kids
cdef readonly uint32_t r_kids
cdef readonly Tokens _seq
cdef readonly int i

View File

@@ -85,7 +85,7 @@ cdef class Tokens:
token (Token):
"""
bounds_check(i, self.length, PADDING)
return cinit_token(&self.data[i])
return Token(self, i)
def __iter__(self):
"""Iterate over the tokens.
@@ -174,38 +174,26 @@ cdef class Tokens:
self.data[i].lex = &EMPTY_LEXEME
cdef Token cinit_token(const TokenC* c_tok):
cdef Token py_tok = Token.__new__(Token)
py_tok.morph = c_tok.morph
py_tok.pos = c_tok.pos
py_tok.fine_pos = c_tok.fine_pos
py_tok.idx = c_tok.idx
py_tok.lemma = c_tok.lemma
py_tok.sense = c_tok.sense
py_tok.dep_tag = c_tok.dep_tag
py_tok.head_offset = c_tok.head
py_tok.l_kids = c_tok.l_kids
py_tok.r_kids = c_tok.r_kids
return py_tok
@cython.freelist(64)
cdef class Token:
"""An individual token.
"""
def __init__(self):
pass
#self._seq = tokens
#self.i = i
#def __unicode__(self):
# cdef const TokenC* t = &self._seq.data[self.i]
# cdef int end_idx = t.idx + t.lex.length
# if self.i + 1 == self._seq.length:
# return self.string
# if end_idx == t[1].idx:
# return self.string
# else:
# return self.string + ' '
Internally, the Token is a tuple (i, tokens) --- it delegates to the Tokens
object.
"""
def __init__(self, Tokens tokens, int i):
self._seq = tokens
self.i = i
def __unicode__(self):
cdef const TokenC* t = &self._seq.data[self.i]
cdef int end_idx = t.idx + t.lex.length
if self.i + 1 == self._seq.length:
return self.string
if end_idx == t[1].idx:
return self.string
else:
return self.string + ' '
def __len__(self):
"""The number of unicode code-points in the original string.
@@ -213,87 +201,87 @@ cdef class Token:
Returns:
length (int):
"""
return self.length
return self._seq.data[self.i].lex.length
#property idx:
# """The index into the original string at which the token starts.
property idx:
"""The index into the original string at which the token starts.
# The following is supposed to always be true:
#
# >>> original_string[token.idx:token.idx + len(token)] == token.string
# """
# def __get__(self):
# return self._seq.data[self.i].idx
The following is supposed to always be true:
#property cluster:
# """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering
#
# Similar words have better-than-chance likelihood of having similar cluster
# IDs, although the clustering is quite noisy. Cluster IDs make good features,
# and help to make models slightly more robust to domain variation.
>>> original_string[token.idx:token.idx + len(token)] == token.string
"""
def __get__(self):
return self._seq.data[self.i].idx
# A common trick is to use only the first N bits of a cluster ID in a feature,
# as the more general part of the hierarchical clustering is often more accurate
# than the lower categories.
property cluster:
"""The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering
# To assist in this, I encode the cluster IDs little-endian, to allow a simple
# bit-mask:
Similar words have better-than-chance likelihood of having similar cluster
IDs, although the clustering is quite noisy. Cluster IDs make good features,
and help to make models slightly more robust to domain variation.
# >>> six_bits = cluster & (2**6 - 1)
# """
# def __get__(self):
# return self._seq.data[self.i].lex.cluster
A common trick is to use only the first N bits of a cluster ID in a feature,
as the more general part of the hierarchical clustering is often more accurate
than the lower categories.
#property string:
# """The unicode string of the word, with no whitespace padding."""
# def __get__(self):
# cdef const TokenC* t = &self._seq.data[self.i]
# if t.lex.sic == 0:
# return ''
# cdef bytes utf8string = self._seq.vocab.strings[t.lex.sic]
# return utf8string.decode('utf8')
To assist in this, I encode the cluster IDs little-endian, to allow a simple
bit-mask:
#property lemma:
# """The unicode string of the word's lemma. If no part-of-speech tag is
# assigned, the most common part-of-speech tag of the word is used.
# """
# def __get__(self):
# cdef const TokenC* t = &self._seq.data[self.i]
# if t.lemma == 0:
# return self.string
# cdef bytes utf8string = self._seq.vocab.strings[t.lemma]
# return utf8string.decode('utf8')
>>> six_bits = cluster & (2**6 - 1)
"""
def __get__(self):
return self._seq.data[self.i].lex.cluster
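
A short sketch of the bit-mask trick the docstring describes, with a made-up cluster value:

    # Little-endian encoding puts the most general part of the hierarchy in
    # the low bits, so a plain mask keeps a coarser, more robust feature.
    cluster = 0b10110110               # hypothetical Brown cluster ID
    six_bits = cluster & (2**6 - 1)    # the docstring's example
    four_bits = cluster & (2**4 - 1)   # even more general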
#property dep_tag:
# """The ID integer of the word's dependency label. If no parse has been
# assigned, defaults to 0.
# """
# def __get__(self):
# return self._seq.data[self.i].dep_tag
property string:
"""The unicode string of the word, with no whitespace padding."""
def __get__(self):
cdef const TokenC* t = &self._seq.data[self.i]
if t.lex.sic == 0:
return ''
cdef unicode py_ustr = self._seq.vocab.strings[t.lex.sic]
return py_ustr
#property pos:
# """The ID integer of the word's part-of-speech tag, from the 13-tag
# Google Universal Tag Set. Constants for this tag set are available in
# spacy.typedefs.
# """
# def __get__(self):
# return self._seq.data[self.i].pos
property lemma:
"""The unicode string of the word's lemma. If no part-of-speech tag is
assigned, the most common part-of-speech tag of the word is used.
"""
def __get__(self):
cdef const TokenC* t = &self._seq.data[self.i]
if t.lemma == 0:
return self.string
cdef unicode py_ustr = self._seq.vocab.strings[t.lemma]
return py_ustr
#property fine_pos:
# """The ID integer of the word's fine-grained part-of-speech tag, as assigned
# by the tagger model. Fine-grained tags include morphological information,
# and other distinctions, and allow a more accurate tagger to be trained.
# """
property dep_tag:
"""The ID integer of the word's dependency label. If no parse has been
assigned, defaults to 0.
"""
def __get__(self):
return self._seq.data[self.i].dep_tag
# def __get__(self):
# return self._seq.data[self.i].fine_pos
property pos:
"""The ID integer of the word's part-of-speech tag, from the 13-tag
Google Universal Tag Set. Constants for this tag set are available in
spacy.typedefs.
"""
def __get__(self):
return self._seq.data[self.i].pos
#property sic:
# def __get__(self):
# return self._seq.data[self.i].lex.sic
property fine_pos:
"""The ID integer of the word's fine-grained part-of-speech tag, as assigned
by the tagger model. Fine-grained tags include morphological information,
and other distinctions, and allow a more accurate tagger to be trained.
"""
#property head:
# """The token predicted by the parser to be the head of the current token."""
# def __get__(self):
# cdef const TokenC* t = &self._seq.data[self.i]
# return Token(self._seq, self.i + t.head)
def __get__(self):
return self._seq.data[self.i].fine_pos
property sic:
def __get__(self):
return self._seq.vocab.strings[self._seq.data[self.i].lex.sic]
property head:
"""The token predicted by the parser to be the head of the current token."""
def __get__(self):
cdef const TokenC* t = &self._seq.data[self.i]
return Token(self._seq, self.i + t.head)
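
The shape of this refactor, reduced to plain Python under assumed names: the eager field-copying in cinit_token is replaced by a view that stores only (sequence, index) and reads every attribute from the owning array on demand:

    # Sketch of the delegation pattern, not the real class.
    class TokenView:
        def __init__(self, seq, i):
            self._seq = seq   # owning sequence; holds the real token records
            self.i = i        # this view's position in the sequence
        def __len__(self):
            # every attribute read goes back to the owner's storage
            return self._seq.data[self.i]['length']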

View File

@@ -24,12 +24,13 @@ cdef struct _Cached:
cdef class Vocab:
cpdef public get_lex_props
cpdef public lexeme_props_getter
cdef Pool mem
cpdef readonly StringStore strings
cdef vector[LexemeC*] lexemes
cdef vector[const LexemeC*] lexemes
cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
cdef PreshMap _map

View File

@@ -5,7 +5,7 @@ from os import path
import codecs
from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport init as lexeme_init
from .lexeme cimport set_lex_struct_props
from .lexeme cimport Lexeme_cinit
from .strings cimport slice_unicode
from .strings cimport hash_string
@@ -21,24 +21,6 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
EMPTY_LEXEME.vec = EMPTY_VEC
cdef LexemeC init_lexeme(id_t i, unicode string, hash_t hashed,
StringStore string_store, dict props) except *:
cdef LexemeC lex
lex.id = i
lex.length = len(string)
lex.sic = string_store[string]
lex.cluster = props.get('cluster', 0)
lex.prob = props.get('prob', 0)
lex.prefix = string_store[string[:1]]
lex.suffix = string_store[string[-3:]]
lex.shape = string_store[word_shape(string)]
lex.flags = props.get('flags', 0)
return lex
cdef class Vocab:
'''A map container for a language's LexemeC structs.
'''
@@ -47,7 +29,7 @@ cdef class Vocab:
self._map = PreshMap(2 ** 20)
self.strings = StringStore()
self.lexemes.push_back(&EMPTY_LEXEME)
self.get_lex_props = get_lex_props
self.lexeme_props_getter = get_lex_props
if data_dir is not None:
if not path.exists(data_dir):
@@ -63,32 +45,36 @@ cdef class Vocab:
"""The current number of lexemes stored."""
return self.lexemes.size()
cdef const LexemeC* get(self, Pool mem, UniStr* string) except NULL:
cdef const LexemeC* get(self, Pool mem, UniStr* c_str) except NULL:
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
if necessary, using memory acquired from the given pool. If the pool
is the lexicon's own memory, the lexeme is saved in the lexicon.'''
cdef LexemeC* lex
lex = <LexemeC*>self._map.get(string.key)
lex = <LexemeC*>self._map.get(c_str.key)
if lex != NULL:
return lex
if string.n < 3:
if c_str.n < 3:
mem = self.mem
cdef unicode py_string = string.chars[:string.n]
cdef unicode py_str = c_str.chars[:c_str.n]
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
lex[0] = init_lexeme(self.lexemes.size(), py_string, string.key, self.strings,
self.get_lex_props(py_string))
props = self.lexeme_props_getter(py_str)
set_lex_struct_props(lex, props, self.strings)
if mem is self.mem:
self._map.set(string.key, lex)
lex.id = self.lexemes.size()
self._add_lex_to_vocab(c_str.key, lex)
else:
lex.id = 1
return lex
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
self._map.set(key, <void*>lex)
while self.lexemes.size() < (lex.id + 1):
self.lexemes.push_back(&EMPTY_LEXEME)
self.lexemes[lex.id] = lex
else:
lex[0].id = 1
return lex
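
The get-or-create logic above, restated as a plain-Python sketch with hypothetical names (make_lexeme stands in for the props-to-struct step):

    # Sketch: look the hashed string up; on a miss, build the lexeme from the
    # props getter and register it only if it lives in the vocab's own memory.
    def get_or_create(vocab, py_str, own_memory=True):
        key = hash(py_str)                # stands in for c_str.key
        lex = vocab.map.get(key)
        if lex is not None:
            return lex
        if len(py_str) < 3:               # short strings always use vocab memory
            own_memory = True
        lex = make_lexeme(vocab.lexeme_props_getter(py_str))  # hypothetical ctor
        if own_memory:
            lex.id = len(vocab.lexemes)   # mirrors _add_lex_to_vocab above
            vocab.map[key] = lex
            vocab.lexemes.append(lex)
        else:
            lex.id = 1                    # transient lexeme, not registered
        return lex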
def __getitem__(self, id_or_string):
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously
unseen unicode string is given, a new LexemeC is created and stored.
unseen unicode string is given, a new lexeme is created and stored.
Args:
id_or_string (int or unicode): The integer ID of a word, or its unicode
@@ -100,24 +86,28 @@ cdef class Vocab:
lexeme (Lexeme): An instance of the Lexeme Python class, with data
copied on instantiation.
'''
cdef UniStr string
cdef UniStr c_str
cdef const LexemeC* lexeme
if type(id_or_string) == int:
if id_or_string >= self.lexemes.size():
raise IndexError
lexeme = self.lexemes.at(id_or_string)
else:
slice_unicode(&string, id_or_string, 0, len(id_or_string))
lexeme = self.get(self.mem, &string)
slice_unicode(&c_str, id_or_string, 0, len(id_or_string))
lexeme = self.get(self.mem, &c_str)
return Lexeme_cinit(lexeme, self.strings)
def __setitem__(self, unicode uni_string, dict props):
cdef UniStr s
slice_unicode(&s, uni_string, 0, len(uni_string))
# Cast through the const here, since we're allowed to change our own
# LexemeCs.
lex = <LexemeC*><void*>self.get(self.mem, &s)
lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props)
def __setitem__(self, unicode py_str, dict props):
cdef UniStr c_str
slice_unicode(&c_str, py_str, 0, len(py_str))
cdef LexemeC* lex
lex = <LexemeC*>self._map.get(c_str.key)
if lex == NULL:
lex = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
lex.id = self.lexemes.size()
self._add_lex_to_vocab(c_str.key, lex)
set_lex_struct_props(lex, props, self.strings)
assert lex.sic < 1000000
def dump(self, loc):
if path.exists(loc):
@@ -154,6 +144,7 @@ cdef class Vocab:
if st != 1:
break
lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
lexeme.vec = EMPTY_VEC
st = fread(lexeme, sizeof(LexemeC), 1, fp)
if st != 1:
break