mirror of https://github.com/explosion/spaCy.git
commit 811b7a6b91
parent 314658b31c

* Struggling with arbitrary attr access...
@@ -1,5 +0,0 @@
-Cython API
-==========
-
-Cheat Sheet
------------
@@ -1,2 +0,0 @@
-Adding a Language
-=================
@@ -1,45 +0,0 @@
-Python API
-==========
-
-.. py:currentmodule:: spacy.en
-
-To and from unicode strings
----------------------------
-
-.. autofunction:: tokenize
-.. autofunction:: lookup
-.. autofunction:: unhash
-
-Access (Hashed) String Views
-----------------------------
-
-.. autofunction:: lex_of
-.. autofunction:: norm_of
-.. autofunction:: shape_of
-.. autofunction:: last3_of
-
-Access String Properties
-------------------------
-
-.. autofunction:: length_of
-.. autofunction:: first_of
-
-Check Orthographic Flags
--------------------------
-
-.. autofunction:: is_alpha
-.. autofunction:: is_digit
-.. autofunction:: is_punct
-.. autofunction:: is_space
-.. autofunction:: is_lower
-.. autofunction:: is_upper
-.. autofunction:: is_title
-.. autofunction:: is_ascii
-
-Access Distributional Information
----------------------------------
-
-.. autofunction:: prob_of
-.. autofunction:: cluster_of
-.. autofunction:: check_tag_flag
-.. autofunction:: check_dist_flag
spacy/en.pxd (26 lines changed)
@@ -1,19 +1,39 @@
 from libcpp.vector cimport vector

 from spacy.spacy cimport StringHash
-from spacy.spacy cimport Lexeme
-from spacy.spacy cimport Lexeme_addr
+from spacy.lexeme cimport Lexeme
+from spacy.lexeme cimport LexID
+from spacy.lexeme cimport ClusterID

 from spacy.spacy cimport Language
 from spacy.tokens cimport Tokens
+cimport cython
+
+
+ctypedef fused AttrType:
+    ClusterID
+    StringHash
+    cython.char
+
+
+cdef enum AttrName:
+    LEX
+    FIRST
+    LENGTH
+    CLUSTER
+    NORM
+    SHAPE
+    LAST3
+
+
 cdef class English(spacy.Language):
     cdef int find_split(self, unicode word)
+    cdef int set_orth(self, unicode word, Lexeme* lex) except -1
+    cdef AttrType attr_of(self, LexID lex_id, AttrName attr) except *


 cdef English EN

-cpdef Lexeme_addr lookup(unicode word) except 0
+cpdef LexID lookup(unicode word) except 0
 cpdef Tokens tokenize(unicode string)
 cpdef unicode unhash(StringHash hash_value)
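Taken together, the cpdef layer declared here gives a small round-trip API. A usage sketch, with return values taken from the doctests later in this commit (the import path for norm_of is an assumption about where the view accessors live):

    from spacy.en import tokenize, lookup, unhash
    from spacy.lexeme import norm_of   # assumed location of the view accessors

    lex_id = lookup(u'Hi')             # LexID: interned on first sight
    print(unhash(norm_of(lex_id)))     # u'hi', per the norm_of doctest below
    tokens = tokenize(u'Hello world')  # a Tokens sequence of LexIDs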
spacy/en.pyx (73 lines changed)
@@ -26,10 +26,8 @@ scheme in several important respects:

 Take care to ensure your training and run-time data is tokenized according to the
 same scheme. Tokenization problems are a major cause of poor performance for
-NLP tools.
-
-If you're using a pre-trained model, the spacy.ptb3 module provides a fully Penn
-Treebank 3-compliant tokenizer.
+NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module
+provides a fully Penn Treebank 3-compliant tokenizer.
 '''
 #The script translate_treebank_tokenization can be used to transform a treebank's
 #annotation to use one of the spacy tokenization schemes.
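The warning in that docstring is easy to make concrete: features learned against one tokenization never fire on another. A plain-Python illustration (not spaCy API):

    # PTB-style tokenization splits contractions; naive whitespace does not.
    ptb_style = [u'do', u"n't"]    # how Penn Treebank tokenizes "don't"
    naive = u"don't".split()       # [u"don't"] -- a single token
    assert ptb_style != naive      # same text, two incompatible schemes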
@@ -53,8 +51,12 @@ from .lexeme import *

 cdef class English(spacy.Language):
     cdef int set_orth(self, unicode word, Lexeme* lex) except -1:
         pass
+    # How to ensure the order here aligns with orthography.latin?
+    view_funcs = [
+        get_normalized,
+        get_word_shape,
+        get_last3
+    ]

     cdef int find_split(self, unicode word):
         cdef size_t length = len(word)
@@ -74,6 +76,27 @@ cdef class English(spacy.Language):
             i += 1
         return i

+    cdef AttrType attr_of(self, LexID lex_id, AttrName attr) except *:
+        cdef Lexeme* w = <Lexeme*>lex_id
+        if attr == LEX:
+            return <AttrType>w.lex
+        elif attr == FIRST:
+            return w.string[0]
+        elif attr == LENGTH:
+            return w.length
+        elif attr == CLUSTER:
+            return w.cluster
+        elif attr == NORM:
+            return w.string_views[0]
+        elif attr == SHAPE:
+            return w.string_views[1]
+        elif attr == LAST3:
+            return w.string_views[2]
+        else:
+            raise AttributeError(attr)
+
+
 cdef bint check_punct(unicode word, size_t i, size_t length):
     # Don't count apostrophes as punct if the next char is a letter
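A note on the fused return type: Cython resolves a fused type like AttrType at compile time, one specialization per call site, while attr_of wants to choose the return type from a runtime AttrName value — a mismatch that is presumably the "struggling" in the commit title. The if/elif chain is a runtime dispatch; in plain Python the same thing is just a lookup table (names mirror the diff, but this is an illustrative stand-in, not spaCy API):

    LEX, FIRST, LENGTH, CLUSTER, NORM, SHAPE, LAST3 = range(7)

    def attr_of(w, attr):
        # w stands in for a Lexeme*; string_views holds NORM/SHAPE/LAST3 hashes
        getters = {
            LEX: lambda w: w.lex,
            FIRST: lambda w: w.string[0],
            LENGTH: lambda w: w.length,
            CLUSTER: lambda w: w.cluster,
            NORM: lambda w: w.string_views[0],
            SHAPE: lambda w: w.string_views[1],
            LAST3: lambda w: w.string_views[2],
        }
        try:
            return getters[attr](w)
        except KeyError:
            raise AttributeError(attr)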
@@ -110,9 +133,6 @@ cpdef Tokens tokenize(unicode string):
     return EN.tokenize(string)

-
-# +49 151 4336 2587
-
 cpdef LexID lookup(unicode string) except 0:
     """Retrieve (or create, if not found) a Lexeme ID for a string.
@@ -124,7 +144,7 @@ cpdef LexID lookup(unicode string) except 0:
     Returns:
         lexeme (LexID): A reference to a lexical type.
     """
-    return <Lexeme_addr>EN.lookup(string)
+    return <LexID>EN.lookup(string)


 cpdef unicode unhash(StringHash hash_value):
@@ -142,3 +162,36 @@ cpdef unicode unhash(StringHash hash_value):
         string (unicode): A unicode string that hashes to the hash_value.
     """
     return EN.unhash(hash_value)
+
+
+def add_string_views(view_funcs):
+    """Add string views to existing and previous lexical entries.
+
+    Args:
+        view_funcs (list): A list of unicode --> unicode functions.
+
+    Returns:
+        view_id (int): An integer key you can use to access the view.
+    """
+    pass
+
+
+def load_clusters(location):
+    """Load cluster data.
+    """
+    pass
+
+
+def load_unigram_probs(location):
+    """Load unigram probabilities.
+    """
+    pass
+
+
+def load_case_stats(location):
+    """Load case stats.
+    """
+    pass
+
+
+def load_tag_stats(location):
+    """Load tag statistics.
+    """
+    pass
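The loaders and add_string_views are stubs in this commit, but the docstring pins down the intended contract: any unicode-to-unicode function can become a view. A sketch of how that contract would be exercised once implemented (the view function and the returned key are hypothetical):

    def reversed_view(string):
        # any unicode -> unicode function qualifies as a string view
        return string[::-1]

    view_id = add_string_views([reversed_view])  # int key, per the docstring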
@@ -1,6 +1,6 @@
-from libc.stdint cimport uint32_t
+from libc.stdint cimport uint64_t

 cimport cython

 ctypedef int ClusterID
-ctypedef uint32_t StringHash
+ctypedef uint64_t StringHash
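The commit does not say why StringHash widens from 32 to 64 bits, but the usual reason is collision risk in a hash-to-string reverse index like bacov below: by the birthday bound, 32-bit hashes are expected to collide after only about 82,000 distinct strings, which is smaller than a realistic vocabulary. A quick check:

    import math

    # Expected number of random strings before two share a hash value.
    for bits in (32, 64):
        print(bits, int(math.sqrt(math.pi / 2 * 2 ** bits)))
    # 32 -> ~82,000 strings; 64 -> ~5.4 billion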
@@ -10,10 +10,9 @@ cdef enum OrthFlag:


 cdef enum:
     LEX
-    LAST3
     NORM
     SHAPE
+    LAST3

+from spacy.lexeme cimport LexID
+from spacy.lexeme cimport StringHash
@@ -1,20 +1,38 @@
 # cython: embedsignature=True
 from __future__ import unicode_literals

 from spacy.lexeme cimport Lexeme

-def get_normalized(unicode lex):
-    if lex.isalpha() and lex.islower():
-        return lex
+def get_normalized(unicode word):
+    """Todo.
+
+    Args:
+        word (unicode)
+
+    Returns:
+        normalized (unicode)
+    """
+    if word.isalpha() and word.islower():
+        return word
     else:
-        return get_word_shape(lex)
+        return get_word_shape(word)


-def get_word_shape(unicode lex):
-    cdef size_t length = len(lex)
+def get_word_shape(unicode word):
+    """Todo.
+
+    Args:
+        word (unicode)
+
+    Returns:
+        shape (unicode)
+    """
+    cdef size_t length = len(word)
     shape = ""
     last = ""
     shape_char = ""
     seq = 0
-    for c in lex:
+    for c in word:
         if c.isalpha():
             if c.isupper():
                 shape_char = "X"
@@ -35,8 +53,14 @@ def get_word_shape(unicode lex):
     return shape


 cpdef unicode get_last3(unicode string):
     return string[-3:]


 cpdef bint is_alpha(LexID lex_id) except *:
-    """Give the result of unicode.isalpha() for a Lexeme ID.
+    """Check whether all characters in the word's string are alphabetic.
+
+    Should match the :py:func:`unicode.isalpha()` function.

     >>> is_alpha(lookup(u'Hello'))
     True
@@ -49,7 +73,9 @@ cpdef bint is_alpha(LexID lex_id) except *:


 cpdef bint is_digit(LexID lex_id) except *:
-    """Give the result of unicode.isdigit() for a Lexeme ID.
+    """Check whether all characters in the word's string are numeric.
+
+    Should match the :py:func:`unicode.isdigit()` function.

     >>> is_digit(lookup(u'10'))
     True
@@ -62,8 +88,8 @@ cpdef bint is_digit(LexID lex_id) except *:


 cpdef bint is_punct(LexID lex_id) except *:
-    """Give the result of checking whether all characters belong to a punctuation
-    unicode data category for a Lexeme ID.
+    """Check whether all characters belong to a punctuation unicode data category
+    for a Lexeme ID.

     >>> is_punct(lookup(u'.'))
     True
@@ -78,11 +104,11 @@ cpdef bint is_punct(LexID lex_id) except *:
 cpdef bint is_space(LexID lex_id) except *:
     """Give the result of unicode.isspace() for a Lexeme ID.

-    >>> is_space(lookup(u'\t'))
+    >>> is_space(lookup(u'\\t'))
     True
     >>> is_space(lookup(u'<unicode space>'))
     True
-    >>> is_space(lookup(u'Hi\n'))
+    >>> is_space(lookup(u'Hi\\n'))
     False
     """
     return (<Lexeme*>lex_id).orth_flags & 1 << IS_SPACE
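The doctest edits here are escaping fixes: inside a docstring, '\t' has already been turned into a literal tab before doctest ever parses the example, so the source line renders badly and shows code other than what was written. Doubling the backslash keeps the two characters backslash-t in the docstring; doctest then evaluates u'\t' itself and builds a real tab at run time. A quick demonstration:

    raw = 'u\'\t\''       # what the docstring held before: an actual tab
    escaped = 'u\'\\t\''  # after the fix: backslash followed by 't'
    assert '\t' in raw and '\t' not in escaped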
@@ -144,8 +170,8 @@ cpdef StringHash norm_of(LexID lex_id) except 0:
     """Return the hash of a "normalized" version of the string.

     Normalized strings are intended to be less sparse, while still capturing
-    important lexical information. See spacy.latin.orthography.normalize_string for details of the normalization
-    function.
+    important lexical information. See :py:func:`spacy.latin.orthography.normalize_string`
+    for details of the normalization function.

     >>> unhash(norm_of(lookup(u'Hi')))
     u'hi'
@@ -160,7 +186,7 @@ cpdef StringHash norm_of(LexID lex_id) except 0:
 cpdef StringHash shape_of(LexID lex_id) except 0:
     """Return the hash of a string describing the word's "orthographic shape".

-    Orthographic shapes are calculated by the spacy.orthography.latin.string_shape
+    Orthographic shapes are calculated by the :py:func:`spacy.orthography.latin.string_shape`
     function. Word shape features have been found useful for NER and POS tagging,
     e.g. Manning (2011).
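For intuition about shape features: each character maps to a class symbol and long runs collapse, so sparse surface forms share a dense shape. A plain-Python sketch of the idea (the exact run-collapsing rule in get_word_shape above is cut off by the hunk; the cap of 3 here is an assumption):

    def word_shape(word, max_run=3):
        # X = uppercase, x = lowercase, d = digit; other chars pass through
        shape = []
        run = 0
        for c in word:
            if c.isalpha():
                s = 'X' if c.isupper() else 'x'
            elif c.isdigit():
                s = 'd'
            else:
                s = c
            run = run + 1 if shape and shape[-1] == s else 0
            if run < max_run:
                shape.append(s)
        return ''.join(shape)

    # word_shape(u'Pierre') -> 'Xxxx'; word_shape(u'1984') -> 'ddd'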
@@ -24,6 +24,7 @@ TAGS = {}
 DIST_FLAGS = {}

 cdef class Language:
+    view_funcs = []
     def __cinit__(self, name):
         self.name = name
         self.bacov = {}
@@ -90,13 +91,41 @@ cdef class Language:
         cdef bytes byte_string = string.encode('utf8')
         word.string = <char*>byte_string
         word.length = len(byte_string)
         self.set_orth(string, word)

         word.lex = hash(string)
+        word.string_views = <StringHash*>calloc(len(self.view_funcs), sizeof(StringHash))
+        cdef unicode view
+        cdef StringHash hashed
+        for i, view_func in enumerate(self.view_funcs):
+            view = view_func(string)
+            hashed = hash(view)
+            word.string_views[i] = hashed
+            self.bacov[hashed] = view
         self.bacov[word.lex] = string
         self.vocab[word.lex] = <LexID>word
         return word

+    def add_view_funcs(self, list view_funcs):
+        self.view_funcs.extend(view_funcs)
+        cdef size_t nr_views = len(self.view_funcs)
+
+        cdef unicode view
+        cdef StringHash hashed
+        cdef StringHash key
+        cdef unicode string
+        cdef LexID lex_id
+        cdef Lexeme* word
+
+        for key, lex_id in self.vocab:
+            word = <Lexeme*>lex_id
+            free(word.string_views)
+            word.string_views = <StringHash*>calloc(nr_views, sizeof(StringHash))
+            string = word.string[:word.length].decode('utf8')
+            for i, view_func in enumerate(self.view_funcs):
+                view = view_func(string)
+                hashed = hash(view)
+                word.string_views[i] = hashed
+                self.bacov[hashed] = view
+
     cpdef unicode unhash(self, StringHash hash_value):
         '''Fetch a string from the reverse index, given its hash value.'''
         return self.bacov[hash_value]
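add_view_funcs has to do more than append: every Lexeme created before the call carries a string_views array sized for the old function list, so the method reallocates and recomputes views for the whole vocabulary, refreshing the bacov reverse index as it goes. The same logic in plain Python, with dict and list stand-ins for the C structures:

    def add_view_funcs(lang, view_funcs):
        lang.view_funcs.extend(view_funcs)
        for word in lang.vocab.values():      # every cached Lexeme
            string = word.string
            word.string_views = []            # replaces free() + calloc()
            for view_func in lang.view_funcs:
                view = view_func(string)
                hashed = hash(view)
                word.string_views.append(hashed)
                lang.bacov[hashed] = view     # hash -> unicode reverse index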
@@ -3,7 +3,6 @@ from cython.operator cimport preincrement as inc


 from spacy.lexeme cimport Lexeme
-#from spacy.lexeme cimport attr_of, lex_of, norm_of, shape_of
 from spacy.spacy cimport StringHash
@@ -66,8 +65,7 @@ cdef class Tokens:
         cdef StringHash key
         cdef Lexeme_addr t
         for t in self.vctr[0]:
-            #key = attr_of(t, attr)
-            key = 0
+            key = self.lang.attr_of(t, attr)
             if key in indices:
                 groups[indices[key]].append(t)
             else:
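This loop (its enclosing method name falls outside the hunk) buckets token pointers by an attribute view, so for example all tokens sharing a SHAPE land in one group. The same bucketing in plain Python, with the cut-off else branch completed the obvious way (names are hypothetical):

    def group_by(tokens, attr_of, attr):
        indices, groups = {}, []
        for t in tokens:
            key = attr_of(t, attr)        # e.g. the SHAPE or CLUSTER view
            if key in indices:
                groups[indices[key]].append(t)
            else:
                indices[key] = len(groups)
                groups.append([t])
        return groups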