* Struggling with arbitrary attr access...

Matthew Honnibal 2014-08-21 23:49:14 +02:00
parent 314658b31c
commit 811b7a6b91
12 changed files with 162 additions and 89 deletions

View File

@@ -1,5 +0,0 @@
-Cython API
-==========
-Cheat Sheet
------------

View File

@@ -1,2 +0,0 @@
-Adding a Language
-=================

View File

@@ -1,45 +0,0 @@
-Python API
-==========
-
-.. py:currentmodule:: spacy.en
-
-To and from unicode strings
----------------------------
-
-.. autofunction:: tokenize
-.. autofunction:: lookup
-.. autofunction:: unhash
-
-Access (Hashed) String Views
-----------------------------
-
-.. autofunction:: lex_of
-.. autofunction:: norm_of
-.. autofunction:: shape_of
-.. autofunction:: last3_of
-
-Access String Properties
-------------------------
-
-.. autofunction:: length_of
-.. autofunction:: first_of
-
-Check Orthographic Flags
--------------------------
-
-.. autofunction:: is_alpha
-.. autofunction:: is_digit
-.. autofunction:: is_punct
-.. autofunction:: is_space
-.. autofunction:: is_lower
-.. autofunction:: is_upper
-.. autofunction:: is_title
-.. autofunction:: is_ascii
-
-Access Distributional Information
----------------------------------
-
-.. autofunction:: prob_of
-.. autofunction:: cluster_of
-.. autofunction:: check_tag_flag
-.. autofunction:: check_dist_flag
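
The deleted page above documented spaCy's early flat-function API. Typical
usage, reconstructed from the function list (a sketch, not an excerpt from
the docs):

    from spacy.en import tokenize, lookup, unhash
    from spacy.en import shape_of, is_alpha

    tokens = tokenize(u'Hello world.')
    lex_id = lookup(u'Hello')          # create-or-get a Lexeme ID
    print(unhash(shape_of(lex_id)))    # hashed string view -> unicode
    print(is_alpha(lex_id))            # orthographic flag check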

View File

@@ -1,19 +1,39 @@
 from libcpp.vector cimport vector
 from spacy.spacy cimport StringHash
-from spacy.spacy cimport Lexeme
-from spacy.spacy cimport Lexeme_addr
+from spacy.lexeme cimport Lexeme
+from spacy.lexeme cimport LexID
+from spacy.lexeme cimport ClusterID
 from spacy.spacy cimport Language
 from spacy.tokens cimport Tokens
+cimport cython
+
+
+ctypedef fused AttrType:
+    ClusterID
+    StringHash
+    cython.char
+
+
+cdef enum AttrName:
+    LEX
+    FIRST
+    LENGTH
+    CLUSTER
+    NORM
+    SHAPE
+    LAST3


 cdef class English(spacy.Language):
     cdef int find_split(self, unicode word)
     cdef int set_orth(self, unicode word, Lexeme* lex) except -1
+    cdef AttrType attr_of(self, LexID lex_id, AttrName attr) except *


 cdef English EN

-cpdef Lexeme_addr lookup(unicode word) except 0
+cpdef LexID lookup(unicode word) except 0
 cpdef Tokens tokenize(unicode string)
 cpdef unicode unhash(StringHash hash_value)

View File

@@ -26,10 +26,8 @@ scheme in several important respects:
 Take care to ensure your training and run-time data is tokenized according to the
 same scheme. Tokenization problems are a major cause of poor performance for
-NLP tools.
-
-If you're using a pre-trained model, the spacy.ptb3 module provides a fully Penn
-Treebank 3-compliant tokenizer.
+NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module
+provides a fully Penn Treebank 3-compliant tokenizer.
 '''
 #The script translate_treebank_tokenization can be used to transform a treebank's
 #annotation to use one of the spacy tokenization schemes.
@@ -53,8 +51,12 @@ from .lexeme import *

 cdef class English(spacy.Language):
-    cdef int set_orth(self, unicode word, Lexeme* lex) except -1:
-        pass
+    # How to ensure the order here aligns with orthography.latin?
+    view_funcs = [
+        get_normalized,
+        get_word_shape,
+        get_last3
+    ]

     cdef int find_split(self, unicode word):
         cdef size_t length = len(word)
@@ -74,6 +76,27 @@ cdef class English(spacy.Language):
             i += 1
         return i

+    cdef AttrType attr_of(self, LexID lex_id, AttrName attr) except *:
+        cdef Lexeme* w = <Lexeme*>lex_id
+        if attr == LEX:
+            return <AttrType>w.lex
+        elif attr == FIRST:
+            return w.string[0]
+        elif attr == LENGTH:
+            return w.length
+        elif attr == CLUSTER:
+            return w.cluster
+        elif attr == NORM:
+            return w.string_views[0]
+        elif attr == SHAPE:
+            return w.string_views[1]
+        elif attr == LAST3:
+            return w.string_views[2]
+        else:
+            raise AttributeError(attr)


 cdef bint check_punct(unicode word, size_t i, size_t length):
     # Don't count apostrophes as punct if the next char is a letter
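
Because AttrType is a fused type, Cython compiles one specialization of
attr_of per member type, and `except *` is required since no single sentinel
value can mark errors for all of them. The dispatch itself is an ordinary
ladder; in plain Python, with a dict standing in for the Lexeme struct, it
amounts to roughly:

    LEX, FIRST, LENGTH, CLUSTER, NORM, SHAPE, LAST3 = range(7)

    def attr_of(word, attr):
        # word is a dict standing in for Lexeme; NORM, SHAPE and LAST3 are
        # declared contiguously, so they index the string_views array.
        if attr == LEX:
            return word['lex']
        elif attr == FIRST:
            return word['string'][0]
        elif attr == LENGTH:
            return word['length']
        elif attr == CLUSTER:
            return word['cluster']
        elif attr in (NORM, SHAPE, LAST3):
            return word['string_views'][attr - NORM]
        raise AttributeError(attr)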
@@ -110,9 +133,6 @@ cpdef Tokens tokenize(unicode string):
     return EN.tokenize(string)

-
-# +49 151 4336 2587
-

 cpdef LexID lookup(unicode string) except 0:
     """Retrieve (or create, if not found) a Lexeme ID for a string.
@@ -124,7 +144,7 @@ cpdef LexID lookup(unicode string) except 0:
     Returns:
         lexeme (LexID): A reference to a lexical type.
     """
-    return <Lexeme_addr>EN.lookup(string)
+    return <LexID>EN.lookup(string)


 cpdef unicode unhash(StringHash hash_value):
@@ -142,3 +162,36 @@ cpdef unicode unhash(StringHash hash_value):
         string (unicode): A unicode string that hashes to the hash_value.
     """
     return EN.unhash(hash_value)
+
+
+def add_string_views(view_funcs):
+    """Add a string view to existing and previous lexical entries.
+
+    Args:
+        get_view (function): A unicode --> unicode function.
+
+    Returns:
+        view_id (int): An integer key you can use to access the view.
+    """
+    pass
+
+
+def load_clusters(location):
+    """Load cluster data.
+    """
+    pass
+
+
+def load_unigram_probs(location):
+    """Load unigram probabilities.
+    """
+    pass
+
+
+def load_case_stats(location):
+    """Load case stats.
+    """
+    pass
+
+
+def load_tag_stats(location):
+    """Load tag statistics.
+    """
+    pass
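
A hypothetical caller of the stubbed API above (add_string_views does nothing
yet, so this shows intent rather than behavior; the reversed-string view is
invented for illustration):

    from spacy import en

    def reversed_view(string):
        # toy unicode -> unicode view function
        return string[::-1]

    view_id = en.add_string_views([reversed_view])
    # view_id would then key the new view on every lexeme, old and new.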

View File

@@ -1,6 +1,6 @@
 from libc.stdint cimport uint32_t
 from libc.stdint cimport uint64_t
+cimport cython

 ctypedef int ClusterID
 ctypedef uint32_t StringHash
View File

@@ -10,10 +10,9 @@ cdef enum OrthFlag:

 cdef enum:
-    LEX
-    LAST3
     NORM
     SHAPE
+    LAST3

 from spacy.lexeme cimport LexID
 from spacy.lexeme cimport StringHash

View File

@@ -1,20 +1,38 @@
 # cython: embedsignature=True
+from __future__ import unicode_literals

 from spacy.lexeme cimport Lexeme


-def get_normalized(unicode lex):
-    if lex.isalpha() and lex.islower():
-        return lex
+def get_normalized(unicode word):
+    """Todo.
+
+    Args:
+        word (unicode)
+
+    Returns:
+        normalized (unicode)
+    """
+    if word.isalpha() and word.islower():
+        return word
     else:
-        return get_word_shape(lex)
+        return get_word_shape(word)


-def get_word_shape(unicode lex):
-    cdef size_t length = len(lex)
+def get_word_shape(unicode word):
+    """Todo.
+
+    Args:
+        word (unicode)
+
+    Returns:
+        shape (unicode)
+    """
+    cdef size_t length = len(word)
     shape = ""
     last = ""
     shape_char = ""
     seq = 0
-    for c in lex:
+    for c in word:
         if c.isalpha():
             if c.isupper():
                 shape_char = "X"
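
The hunk truncates inside the loop; the idea of the shape transform is to map
letters to "X"/"x" and digits to "d", compressing long runs of the same
symbol. A pure-Python sketch (the run cap of 4 is an assumption, since the
committed cut-off falls outside this hunk):

    def word_shape(word, max_seq=4):
        # Map each character to a shape symbol, dropping repeats once a run
        # exceeds max_seq, so shapes stay short and low-cardinality.
        shape = []
        last = ""
        seq = 0
        for c in word:
            if c.isalpha():
                shape_char = "X" if c.isupper() else "x"
            elif c.isdigit():
                shape_char = "d"
            else:
                shape_char = c
            if shape_char == last:
                seq += 1
            else:
                seq = 0
                last = shape_char
            if seq < max_seq:
                shape.append(shape_char)
        return "".join(shape)

    # word_shape(u"Google") == u"Xxxxx"; word_shape(u"C3PO") == u"XdXX"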
@@ -35,8 +53,14 @@ def get_word_shape(unicode lex):
     return shape


+cpdef unicode get_last3(unicode string):
+    return string[-3:]
+
+

 cpdef bint is_alpha(LexID lex_id) except *:
-    """Give the result of unicode.isalpha() for a Lexeme ID.
+    """Check whether all characters in the word's string are alphabetic.
+
+    Should match the :py:func:`unicode.isalpha()` function.

     >>> is_alpha(lookup(u'Hello'))
     True
@@ -49,7 +73,9 @@ cpdef bint is_alpha(LexID lex_id) except *:

 cpdef bint is_digit(LexID lex_id) except *:
-    """Give the result of unicode.isdigit() for a Lexeme ID.
+    """Check whether all characters in the word's string are numeric.
+
+    Should match the :py:func:`unicode.isdigit()` function.

     >>> is_digit(lookup(u'10'))
     True
@@ -62,8 +88,8 @@ cpdef bint is_digit(LexID lex_id) except *:

 cpdef bint is_punct(LexID lex_id) except *:
-    """Give the result of checking whether all characters belong to a punctuation
-    unicode data category for a Lexeme ID.
+    """Check whether all characters belong to a punctuation unicode data category
+    for a Lexeme ID.

     >>> is_punct(lookup(u'.'))
     True
@@ -78,11 +104,11 @@ cpdef bint is_punct(LexID lex_id) except *:

 cpdef bint is_space(LexID lex_id) except *:
     """Give the result of unicode.isspace() for a Lexeme ID.

-    >>> is_space(lookup(u'\t'))
+    >>> is_space(lookup(u'\\t'))
     True
     >>> is_space(lookup(u'<unicode space>'))
     True
-    >>> is_space(lookup(u'Hi\n'))
+    >>> is_space(lookup(u'Hi\\n'))
     False
     """
     return (<Lexeme*>lex_id).orth_flags & 1 << IS_SPACE
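
Each orthographic predicate reads a single bit of the lexeme's packed
orth_flags field. The scheme, sketched with illustrative flag positions (the
committed OrthFlag values may differ):

    IS_ALPHA, IS_DIGIT, IS_PUNCT, IS_SPACE = 0, 1, 2, 3  # assumed positions

    def set_orth_flags(string):
        flags = 0
        flags |= string.isalpha() << IS_ALPHA
        flags |= string.isdigit() << IS_DIGIT
        flags |= string.isspace() << IS_SPACE
        return flags

    def is_space(flags):
        return bool(flags & (1 << IS_SPACE))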
@@ -144,8 +170,8 @@ cpdef StringHash norm_of(LexID lex_id) except 0:
     """Return the hash of a "normalized" version of the string.

     Normalized strings are intended to be less sparse, while still capturing
-    important lexical information. See spacy.latin.orthography.normalize_string for details of the normalization
-    function.
+    important lexical information. See :py:func:`spacy.latin.orthography.normalize_string`
+    for details of the normalization function.

     >>> unhash(norm_of(lookup(u'Hi')))
     u'hi'
@@ -160,7 +186,7 @@ cpdef StringHash norm_of(LexID lex_id) except 0:

 cpdef StringHash shape_of(LexID lex_id) except 0:
     """Return the hash of a string describing the word's "orthographic shape".

-    Orthographic shapes are calculated by the spacy.orthography.latin.string_shape
+    Orthographic shapes are calculated by the :py:func:`spacy.orthography.latin.string_shape`
     function. Word shape features have been found useful for NER and POS tagging,
     e.g. Manning (2011).
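
Since views are stored as hashes, reading one back goes through the reverse
index; a doctest-style sketch (the exact output depends on the shape
function):

    >>> unhash(shape_of(lookup(u'Hello')))
    u'Xxxxx'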

View File

@@ -24,6 +24,7 @@ TAGS = {}
 DIST_FLAGS = {}

 cdef class Language:
+    view_funcs = []
     def __cinit__(self, name):
         self.name = name
         self.bacov = {}
@@ -90,13 +91,41 @@ cdef class Language:
         cdef bytes byte_string = string.encode('utf8')
         word.string = <char*>byte_string
         word.length = len(byte_string)
-        self.set_orth(string, word)
         word.lex = hash(string)
+        word.string_views = <StringHash*>calloc(len(self.view_funcs), sizeof(StringHash))
+        cdef unicode view
+        cdef StringHash hashed
+        for i, view_func in enumerate(self.view_funcs):
+            view = view_func(string)
+            hashed = hash(view)
+            word.string_views[i] = hashed
+            self.bacov[hashed] = view

         self.bacov[word.lex] = string
         self.vocab[word.lex] = <LexID>word
         return word
+    def add_view_funcs(self, list view_funcs):
+        self.view_funcs.extend(view_funcs)
+        cdef size_t nr_views = len(self.view_funcs)
+
+        cdef unicode view
+        cdef StringHash hashed
+        cdef StringHash key
+        cdef unicode string
+        cdef LexID lex_id
+        cdef Lexeme* word
+        for key, lex_id in self.vocab.items():
+            word = <Lexeme*>lex_id
+            free(word.string_views)
+            word.string_views = <StringHash*>calloc(nr_views, sizeof(StringHash))
+            string = word.string[:word.length].decode('utf8')
+            for i, view_func in enumerate(self.view_funcs):
+                view = view_func(string)
+                hashed = hash(view)
+                word.string_views[i] = hashed
+                self.bacov[hashed] = view

     cpdef unicode unhash(self, StringHash hash_value):
         '''Fetch a string from the reverse index, given its hash value.'''
         return self.bacov[hash_value]
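
The registry logic in both hunks amounts to: compute every registered view of
a string, hash it, keep the hashes on the lexeme, and record hash -> string
in the reverse index ("bacov"). A pure-Python sketch with dicts standing in
for the C structs:

    class Language(object):
        def __init__(self):
            self.view_funcs = []
            self.vocab = {}   # lex hash -> lexeme
            self.bacov = {}   # string hash -> unicode string

        def new_lexeme(self, string):
            word = {'lex': hash(string)}
            word['views'] = [self._intern(f(string)) for f in self.view_funcs]
            self.bacov[word['lex']] = string
            self.vocab[word['lex']] = word
            return word

        def add_view_funcs(self, view_funcs):
            # Extend the registry, then recompute views for existing entries.
            self.view_funcs.extend(view_funcs)
            for word in self.vocab.values():
                string = self.bacov[word['lex']]
                word['views'] = [self._intern(f(string)) for f in self.view_funcs]

        def _intern(self, view):
            hashed = hash(view)
            self.bacov[hashed] = view
            return hashed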

View File

@@ -3,7 +3,6 @@ from cython.operator cimport preincrement as inc

 from spacy.lexeme cimport Lexeme
-#from spacy.lexeme cimport attr_of, lex_of, norm_of, shape_of

 from spacy.spacy cimport StringHash

@@ -66,8 +65,7 @@ cdef class Tokens:
         cdef StringHash key
         cdef Lexeme_addr t
         for t in self.vctr[0]:
-            #key = attr_of(t, attr)
-            key = 0
+            key = self.lang.attr_of(t, attr)
             if key in indices:
                 groups[indices[key]].append(t)
             else:
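
group_by buckets tokens by the hashed attribute key, preserving first-seen
group order; the same logic in plain Python (mirroring the hunk above, with
attr_of passed in rather than read off a Language object):

    def group_by(tokens, attr, attr_of):
        indices = {}   # key -> position in groups
        groups = []
        for t in tokens:
            key = attr_of(t, attr)
            if key in indices:
                groups[indices[key]].append(t)
            else:
                indices[key] = len(groups)
                groups.append([t])
        return groups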