Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-25 17:36:30 +03:00

Commit a78ad4152d (parent 5fddb8d165): Broken version being refactored for docs

spacy/en.pxd

@@ -10,6 +10,7 @@ from spacy.tokens cimport Tokens
 cdef class English(spacy.Language):
     cdef int find_split(self, unicode word)
+    cdef int set_orth(self, unicode word, Lexeme* lex) except -1
 
 
 cdef English EN

spacy/en.pyx: 66 changed lines

@@ -1,7 +1,8 @@
 # cython: profile=True
-'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
-so that strings can be retrieved from hashes. Use 64-bit hash values and
-boldly assume no collisions.
+# cython: embedsignature=True
+'''Tokenize English text, allowing some differences from the Penn Treebank
+tokenization, e.g. for email addresses, URLs, etc. Use en_ptb if full PTB
+compatibility is the priority.
 '''
 from __future__ import unicode_literals
 
@@ -9,14 +10,17 @@ from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector
 
 from spacy.string_tools cimport substr
 
 from . import util
 
 cimport spacy
 
+from spacy.orthography.latin cimport *
 
 
 cdef class English(spacy.Language):
+    cdef int set_orth(self, unicode word, Lexeme* lex) except -1:
+        pass
+
     cdef int find_split(self, unicode word):
         cdef size_t length = len(word)
         cdef int i = 0
@@ -26,17 +30,17 @@ cdef class English(spacy.Language):
         if word.endswith("'s") and length >= 3:
             return length - 2
         # Leading punctuation
-        if is_punct(word, 0, length):
+        if check_punct(word, 0, length):
             return 1
         elif length >= 1:
             # Split off all trailing punctuation characters
             i = 0
-            while i < length and not is_punct(word, i, length):
+            while i < length and not check_punct(word, i, length):
                 i += 1
         return i
 
 
-cdef bint is_punct(unicode word, size_t i, size_t length):
+cdef bint check_punct(unicode word, size_t i, size_t length):
     # Don't count appostrophes as punct if the next char is a letter
     if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
         return i == 0
@@ -55,14 +59,52 @@ EN = English('en')
 
 
 cpdef Tokens tokenize(unicode string):
+    """Tokenize a string.
+
+    Wraps EN.tokenize, where EN is an instance of the class English. The global
+    variable manages the vocabulary, and memoizes tokenization rules.
+
+    Args:
+        string (unicode): The string to be split. Must be unicode, not bytes.
+
+    Returns:
+        tokens (Tokens): A Tokens instance, managing a vector of pointers to
+            Lexeme structs. The Tokens instance supports sequence interfaces,
+            but also offers a range of sequence-level operations, which are computed
+            efficiently in Cython-space.
+    """
     return EN.tokenize(string)
 
 
 cpdef Lexeme_addr lookup(unicode string) except 0:
+    """Retrieve (or create) a Lexeme for a string.
+
+    Returns a Lexeme ID, which can be used via the accessor
+    methods in spacy.lexeme
+
+    Args:
+        string (unicode): The string to be looked up. Must be unicode, not bytes.
+
+    Returns:
+        LexemeID (size_t): An unsigned integer that allows the Lexeme to be retrieved.
+            The LexemeID is really a memory address, making dereferencing it essentially
+            free.
+    """
     return <Lexeme_addr>EN.lookup(string)
 
 
 cpdef unicode unhash(StringHash hash_value):
+    """Retrieve a string from a hash value. Mostly used for testing.
+
+    In general you should avoid computing with strings, as they are slower than
+    the intended ID-based usage. However, strings can be recovered if necessary,
+    although no control is taken for hash collisions.
+
+    Args:
+        hash_value (uint32_t): The hash of a string, returned by Python's hash()
+            function.
+
+    Returns:
+        string (unicode): A unicode string that hashes to the hash_value.
+    """
     return EN.unhash(hash_value)
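
The module-level wrappers above are the Python-facing entry points: tokenize returns a Tokens sequence of lex IDs, lookup returns a LexemeID (an integer that is really a Lexeme* address), and unhash recovers a string from the reverse index. A minimal usage sketch, assuming the Cython extensions are compiled and importable as spacy.en and spacy.lexeme as in this commit:

    from __future__ import unicode_literals

    from spacy import en
    from spacy import lexeme

    lex_id = en.lookup(u'Hello')              # LexemeID: an integer holding a Lexeme* address
    print(en.unhash(lexeme.lex_of(lex_id)))   # u'Hello', recovered from the reverse index

    # Following the doctest in spacy/lexeme.pyx: map each token back to its string.
    tokens = en.tokenize(u'Hi! world')
    print([en.unhash(lexeme.lex_of(t)) for t in tokens])   # expected: [u'Hi', u'!', u'world']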

spacy/lexeme.pxd

@@ -1,83 +1,34 @@
 from libc.stdint cimport uint32_t
 from libc.stdint cimport uint64_t
 
-# Put these above import to avoid circular import problem
-
-ctypedef int ClusterID
-ctypedef uint32_t StringHash
-ctypedef size_t Lexeme_addr
-ctypedef char Bits8
-ctypedef uint64_t Bits64
-
-
-cdef enum OrthFlag:
-    IS_ALPHA
-    IS_DIGIT
-    IS_PUNCT
-    IS_WHITE
-    IS_LOWER
-    IS_UPPER
-    IS_TITLE
-    IS_ASCII
-
-
-cdef enum DistFlag:
-    OFT_UPPER
-    OFT_TITLE
-    DIST_FLAG3
-    DIST_FLAG4
-    DIST_FLAG5
-    DIST_FLAG6
-    DIST_FLAG7
-    DIST_FLAG8
-
-
-cdef struct Orthography:
-    StringHash shape
-    StringHash norm
-    StringHash last3
-    Bits8 flags
-
-
-cdef struct Distribution:
-    double prob
-    ClusterID cluster
-    Bits64 tagdict
-    Bits8 flags
+ctypedef size_t LexID
+ctypedef char OrthFlags
+ctypedef char DistFlags
+ctypedef uint64_t TagFlags
 
 
 cdef struct Lexeme:
-    StringHash lex
     char* string
     size_t length
+    StringHash lex
-    Orthography orth      # Extra orthographic views
-    Distribution dist     # Distribution info
+    double prob
+    ClusterID cluster
+    TagFlags possible_tags
+    DistFlags dist_flags
+    OrthFlags orth_flags
     StringHash* string_views
 
 
-cdef Lexeme BLANK_WORD = Lexeme(NULL, 0, 0,
-                                Orthography(0, 0, 0, 0),
-                                Distribution(0.0, 0, 0, 0)
-                               )
+cpdef char first_of(LexID lex_id) except 0
+cpdef size_t length_of(LexID lex_id) except 0
+cpdef double prob_of(LexID lex_id) except 0
+cpdef ClusterID cluster_of(LexID lex_id) except 0
 
+cpdef bint check_tag_flag(LexID lex, TagFlags flag) except *
+cpdef bint check_dist_flag(LexID lex, DistFlags flag) except *
+cpdef bint check_orth_flag(LexID lex, OrthFlags flag) except *
 
-cdef enum StringAttr:
-    LEX
-    NORM
-    SHAPE
-    LAST3
-    LENGTH
-
-
-cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0
-
-cpdef StringHash lex_of(size_t lex_id) except 0
-cpdef StringHash norm_of(size_t lex_id) except 0
-cpdef StringHash shape_of(size_t lex_id) except 0
-cpdef StringHash last3_of(size_t lex_id) except 0
-
-cpdef size_t length_of(size_t lex_id) except *
-
-cpdef double prob_of(size_t lex_id) except 0
-cpdef ClusterID cluster_of(size_t lex_id) except 0
-
-cpdef bint check_orth_flag(size_t lex, OrthFlag flag) except *
-cpdef bint check_dist_flag(size_t lex, DistFlag flag) except *
+cpdef StringHash view_of(LexID lex_id, size_t view) except 0
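
The new Lexeme layout above replaces the nested Orthography and Distribution structs with flat fields plus packed bit-sets: orth_flags and dist_flags are single chars, possible_tags is a 64-bit mask. A flag is set with bits |= 1 << flag and tested with bits & (1 << flag), as the accessors in spacy/lexeme.pyx below do. A small pure-Python sketch of that arithmetic, using the DEF constants that lexeme.pyx introduces (OFT_UPPER = 1, OFT_TITLE = 2):

    # Illustrative sketch of the flag packing behind dist_flags / orth_flags.
    OFT_UPPER = 1
    OFT_TITLE = 2

    def set_flag(bits, flag):
        # what set_flags does for orth_flags
        return bits | (1 << flag)

    def check_flag(bits, flag):
        # what check_dist_flag / check_orth_flag do
        return bool(bits & (1 << flag))

    dist_flags = 0
    dist_flags = set_flag(dist_flags, OFT_TITLE)
    assert check_flag(dist_flags, OFT_TITLE)
    assert not check_flag(dist_flags, OFT_UPPER)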

spacy/lexeme.pyx

@@ -1,32 +1,32 @@
 # cython: profile=True
+# cython: embedsignature=True
 '''Accessors for Lexeme properties, given a lex_id, which is cast to a Lexeme*.
 Mostly useful from Python-space. From Cython-space, you can just cast to
 Lexeme* yourself.
 '''
 from __future__ import unicode_literals
 
 from spacy.string_tools cimport substr
 
 from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector
 
 from spacy.spacy cimport StringHash
 
 
-cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
-    if attr == LEX:
-        return lex_of(lex_id)
-    elif attr == NORM:
-        return norm_of(lex_id)
-    elif attr == SHAPE:
-        return shape_of(lex_id)
-    elif attr == LAST3:
-        return last3_of(lex_id)
-    elif attr == LENGTH:
-        return length_of(lex_id)
-    else:
-        raise StandardError
+cpdef int set_flags(LexID lex_id, object active_flags) except *:
+    """Set orthographic bit flags for a Lexeme.
+
+    Args:
+        lex_id (LexemeID): A reference ID for a Lexeme.
+        active_flags: A sequence of bits to set as True.
+    """
+    cdef size_t flag
+    cdef Lexeme* w = <Lexeme*>lex_id
+    for flag in active_flags:
+        w.orth_flags |= 1 << flag
+
+
+cpdef StringHash view_of(LexID lex_id, size_t view) except 0:
+    return (<Lexeme*>lex_id).string_views[view]
 
 
 cpdef StringHash lex_of(size_t lex_id) except 0:
@@ -37,42 +37,14 @@ cpdef StringHash lex_of(size_t lex_id) except 0:
     delimited tokens split off. The other fields refer to properties of the
     string that the lex field stores a hash of, except sic and tail.
 
-    >>> [unhash(lex_of(lex_id) for lex_id in from_string(u'Hi! world')]
+    >>> from spacy import en
+    >>> [en.unhash(lex_of(lex_id) for lex_id in en.tokenize(u'Hi! world')]
     [u'Hi', u'!', u'world']
     '''
     return (<Lexeme*>lex_id).lex
 
 
-cpdef StringHash norm_of(size_t lex_id) except 0:
-    '''Access the `lex' field of the Lexeme pointed to by lex_id.
-
-    The lex field is the hash of the string you would expect to get back from
-    a standard tokenizer, i.e. the word with punctuation and other non-whitespace
-    delimited tokens split off. The other fields refer to properties of the
-    string that the lex field stores a hash of, except sic and tail.
-
-    >>> [unhash(lex_of(lex_id) for lex_id in from_string(u'Hi! world')]
-    [u'Hi', u'!', u'world']
-    '''
-    return (<Lexeme*>lex_id).orth.norm
-
-
-cpdef StringHash shape_of(size_t lex_id) except 0:
-    return (<Lexeme*>lex_id).orth.shape
-
-
-cpdef StringHash last3_of(size_t lex_id) except 0:
-    '''Access the `last3' field of the Lexeme pointed to by lex_id, which stores
-    the hash of the last three characters of the word:
-
-    >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
-    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
-    [u'llo', u'!']
-    '''
-    return (<Lexeme*>lex_id).orth.last3
-
-
-cpdef ClusterID cluster_of(size_t lex_id) except 0:
+cpdef ClusterID cluster_of(LexID lex_id) except 0:
     '''Access the `cluster' field of the Lexeme pointed to by lex_id, which
     gives an integer representation of the cluster ID of the word,
     which should be understood as a binary address:
@@ -88,10 +60,10 @@ cpdef ClusterID cluster_of(size_t lex_id) except 0:
     while "dapple" is totally different. On the other hand, "scalable" receives
     the same cluster ID as "pineapple", which is not what we'd like.
     '''
-    return (<Lexeme*>lex_id).dist.cluster
+    return (<Lexeme*>lex_id).cluster
 
 
-cpdef Py_UNICODE first_of(size_t lex_id):
+cpdef char first_of(size_t lex_id) except 0:
     '''Access the `first' field of the Lexeme pointed to by lex_id, which
     stores the first character of the lex string of the word.
 
@@ -99,10 +71,10 @@ cpdef Py_UNICODE first_of(size_t lex_id):
     >>> unhash(first_of(lex_id))
     u'H'
     '''
-    return (<Lexeme*>lex_id).orth.first
+    return (<Lexeme*>lex_id).string[0]
 
 
-cpdef size_t length_of(size_t lex_id) except *:
+cpdef size_t length_of(size_t lex_id) except 0:
     '''Access the `length' field of the Lexeme pointed to by lex_id, which stores
     the length of the string hashed by lex_of.'''
     cdef Lexeme* word = <Lexeme*>lex_id
@@ -119,8 +91,10 @@ cpdef double prob_of(size_t lex_id) except 0:
     >>> prob_of(lookup(u'world'))
     -20.10340371976182
     '''
-    return (<Lexeme*>lex_id).dist.prob
+    return (<Lexeme*>lex_id).prob
 
+DEF OFT_UPPER = 1
+DEF OFT_TITLE = 2
 
 cpdef bint is_oft_upper(size_t lex_id):
     '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
@@ -134,7 +108,7 @@ cpdef bint is_oft_upper(size_t lex_id):
     >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
     True
     '''
-    return (<Lexeme*>lex_id).dist.flags & OFT_UPPER
+    return (<Lexeme*>lex_id).dist_flags & (1 << OFT_UPPER)
 
 
 cpdef bint is_oft_title(size_t lex_id):
@@ -149,11 +123,15 @@ cpdef bint is_oft_title(size_t lex_id):
     >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
     True
     '''
-    return (<Lexeme*>lex_id).dist.flags & OFT_TITLE
+    return (<Lexeme*>lex_id).dist_flags & (1 << OFT_TITLE)
 
-cpdef bint check_orth_flag(size_t lex_id, OrthFlag flag) except *:
-    return (<Lexeme*>lex_id).orth.flags & (1 << flag)
+cpdef bint check_orth_flag(size_t lex_id, OrthFlags flag) except *:
+    return (<Lexeme*>lex_id).orth_flags & (1 << flag)
 
 
-cpdef bint check_dist_flag(size_t lex_id, DistFlag flag) except *:
-    return (<Lexeme*>lex_id).dist.flags & (1 << flag)
+cpdef bint check_dist_flag(size_t lex_id, DistFlags flag) except *:
+    return (<Lexeme*>lex_id).dist_flags & (1 << flag)
+
+
+cpdef bint check_tag_flag(LexID lex_id, TagFlags flag) except *:
+    return (<Lexeme*>lex_id).possible_tags & (1 << flag)
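
The accessors above all follow the pattern described in the module docstring: a lex_id is just an integer holding a Lexeme* address, and each cpdef function casts it back to read one field. A usage sketch from Python, assuming the compiled modules are importable; outputs follow the doctests above and are otherwise illustrative:

    from __future__ import unicode_literals

    from spacy import en
    from spacy.lexeme import lex_of, length_of, prob_of, cluster_of

    lex_id = en.lookup(u'Hello')          # get-or-create the Lexeme, return its address
    print(length_of(lex_id))              # 5: length of the underlying string
    print(en.unhash(lex_of(lex_id)))      # u'Hello', via the reverse index
    print(prob_of(lex_id))                # smoothed unigram log probability
    print(cluster_of(lex_id))             # cluster ID, read as a binary address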

spacy/spacy.pxd

@@ -19,8 +19,6 @@ ctypedef int ClusterID
 
 
 from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport Distribution
-from spacy.lexeme cimport Orthography
 
 
 cdef class Language:
@@ -29,7 +27,7 @@ cdef class Language:
     cdef dense_hash_map[StringHash, size_t] vocab
     cdef dict bacov
 
-    cdef Tokens tokenize(self, unicode text)
+    cpdef Tokens tokenize(self, unicode text)
 
     cdef Lexeme* lookup(self, unicode string) except NULL
     cdef Lexeme** lookup_chunk(self, unicode chunk) except NULL
@@ -37,7 +35,8 @@ cdef class Language:
     cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL
     cdef Lexeme* new_lexeme(self, unicode lex) except NULL
 
-    cdef unicode unhash(self, StringHash hashed)
+    cpdef unicode unhash(self, StringHash hashed)
 
-    cpdef list find_substrings(self, unicode word)
+    cpdef list find_substrings(self, unicode chunk)
     cdef int find_split(self, unicode word)
+    cdef int set_orth(self, unicode string, Lexeme* word)

spacy/spacy.pyx: 144 changed lines

@@ -1,4 +1,13 @@
 # cython: profile=True
+# cython: embedsignature=True
+"""Common classes and utilities across languages.
+
+Provides the main implementation for the spacy tokenizer. Specific languages
+subclass the Language class, over-writing the tokenization rules as necessary.
+Special-case tokenization rules are read from data/<lang>/tokenization .
+"""
+
+
 from __future__ import unicode_literals
 
 from libc.stdlib cimport calloc, free
@@ -6,54 +15,13 @@ from libcpp.pair cimport pair
 from cython.operator cimport dereference as deref
 
 from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport BLANK_WORD
 
 from spacy.string_tools cimport substr
+from spacy.lexeme cimport LexID
 
 from . import util
 from os import path
 
+DIST_FLAGS = {}
+TAGS = {}
 
-
-def get_normalized(unicode lex):
-    if lex.isalpha() and lex.islower():
-        return lex
-    else:
-        return get_word_shape(lex)
-
-
-def get_word_shape(unicode lex):
-    cdef size_t length = len(lex)
-    shape = ""
-    last = ""
-    shape_char = ""
-    seq = 0
-    for c in lex:
-        if c.isalpha():
-            if c.isupper():
-                shape_char = "X"
-            else:
-                shape_char = "x"
-        elif c.isdigit():
-            shape_char = "d"
-        else:
-            shape_char = c
-        if shape_char == last:
-            seq += 1
-        else:
-            seq = 0
-        last = shape_char
-        if seq < 3:
-            shape += shape_char
-    assert shape
-    return shape
-
-
-def set_orth_flags(lex):
-    return 0
-
-DIST_FLAGS = {}
 
 cdef class Language:
     def __cinit__(self, name):
@@ -64,9 +32,19 @@ cdef class Language:
         self.chunks.set_empty_key(0)
         self.vocab.set_empty_key(0)
         self.load_tokenization(util.read_tokenization(name))
-        #self.load_dist_info(util.read_dist_info(name))
+        self.load_dist_info(util.read_dist_info(name))
 
-    cdef Tokens tokenize(self, unicode string):
+    cpdef Tokens tokenize(self, unicode string):
+        """Tokenize.
+
+        Split the string into tokens.
+
+        Args:
+            string (unicode): The string to split.
+
+        Returns:
+            tokens (Tokens): A Tokens object.
+        """
         cdef Lexeme** chunk
         cdef Tokens tokens = Tokens(self)
         cdef size_t length = len(string)
@@ -85,8 +63,7 @@ cdef class Language:
         return tokens
 
     cdef Lexeme* lookup(self, unicode string) except NULL:
-        if len(string) == 0:
-            return &BLANK_WORD
+        assert len(string) != 0
         cdef Lexeme* word = <Lexeme*>self.vocab[hash(string)]
         if word == NULL:
             word = self.new_lexeme(string)
@@ -113,56 +90,79 @@ cdef class Language:
         cdef bytes byte_string = string.encode('utf8')
         word.string = <char*>byte_string
         word.length = len(byte_string)
-        word.orth.flags = set_orth_flags(string)
-        cdef unicode norm = get_normalized(string)
-        cdef unicode shape = get_word_shape(string)
-        cdef unicode last3 = string[-3:]
-        word.lex = hash(string)
-        word.orth.norm = hash(norm)
-        word.orth.shape = hash(shape)
-        word.orth.last3 = hash(last3)
-        self.bacov[word.lex] = string
-        self.bacov[word.orth.norm] = norm
-        self.bacov[word.orth.shape] = shape
-        self.bacov[word.orth.last3] = last3
+        self.set_orth(string, word)
 
-        self.vocab[hash(string)] = <size_t>word
+        word.lex = hash(string)
+        self.bacov[word.lex] = string
+        self.vocab[word.lex] = <LexID>word
         return word
 
-    cdef unicode unhash(self, StringHash hash_value):
+    cpdef unicode unhash(self, StringHash hash_value):
         '''Fetch a string from the reverse index, given its hash value.'''
         return self.bacov[hash_value]
 
-    cpdef list find_substrings(self, unicode word):
+    cpdef list find_substrings(self, unicode chunk):
+        """Find how to split a chunk into substrings.
+
+        This method calls find_split repeatedly. Most languages will want to
+        override find_split, but it may be useful to override this instead.
+
+        Args:
+            chunk (unicode): The string to be split, e.g. u"Mike's!"
+
+        Returns:
+            substrings (list): The component substrings, e.g. [u"Mike", "'s", "!"].
+        """
         substrings = []
-        while word:
-            split = self.find_split(word)
+        while chunk:
+            split = self.find_split(chunk)
             if split == 0:
-                substrings.append(word)
+                substrings.append(chunk)
                 break
-            substrings.append(word[:split])
-            word = word[split:]
+            substrings.append(chunk[:split])
+            chunk = chunk[split:]
         return substrings
 
     cdef int find_split(self, unicode word):
         return len(word)
 
-    def load_tokenization(self, token_rules=None):
+    cdef int set_orth(self, unicode string, Lexeme* word):
+        pass
+
+    def load_tokenization(self, token_rules):
+        '''Load special-case tokenization rules.
+
+        Loads special-case tokenization rules into the Language.chunk cache,
+        read from data/<lang>/tokenization . The special cases are loaded before
+        any language data is tokenized, giving these priority. For instance,
+        the English tokenization rules map "ain't" to ["are", "not"].
+
+        Args:
+            token_rules (list): A list of (chunk, tokens) pairs, where chunk is
+                a string and tokens is a list of strings.
+        '''
        for chunk, tokens in token_rules:
             self.new_chunk(chunk, tokens)
 
     def load_dist_info(self, dist_info):
+        '''Load distributional information for the known lexemes of the language.
+
+        The distributional information is read from data/<lang>/dist_info.json .
+        It contains information like the (smoothed) unigram log probability of
+        the word, how often the word is found upper-cased, how often the word
+        is found title-cased, etc.
+        '''
         cdef unicode string
         cdef dict word_dist
         cdef Lexeme* w
         for string, word_dist in dist_info.items():
             w = self.lookup(string)
-            w.dist.prob = word_dist.prob
-            w.dist.cluster = word_dist.cluster
+            w.prob = word_dist.prob
+            w.cluster = word_dist.cluster
             for flag in word_dist.flags:
-                w.dist.flags |= DIST_FLAGS[flag]
+                w.dist_flags |= DIST_FLAGS[flag]
             for tag in word_dist.tagdict:
-                w.dist.tagdict |= TAGS[tag]
+                w.possible_tags |= TAGS[tag]
 
 
 cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
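
The find_substrings loop above fixes the contract between Language and its subclasses: find_split returns how many characters to break off the front of the remaining chunk, and 0 means keep the rest as a single piece. A pure-Python sketch of that loop with a deliberately simple, hypothetical find_split (not the English rules from spacy/en.pyx):

    # Mirror of Language.find_substrings with a toy find_split that only
    # peels a trailing '!' off the chunk; the real English rules also handle
    # leading punctuation and the "'s" suffix.
    def find_split(chunk):
        if chunk.endswith(u'!') and len(chunk) > 1:
            return len(chunk) - 1   # break just before the final '!'
        return 0                    # 0 means: keep the remaining chunk whole

    def find_substrings(chunk):
        substrings = []
        while chunk:
            split = find_split(chunk)
            if split == 0:
                substrings.append(chunk)
                break
            substrings.append(chunk[:split])
            chunk = chunk[split:]
        return substrings

    print(find_substrings(u'Hello!'))   # ['Hello', '!']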

spacy/tokens.pxd

@@ -4,7 +4,6 @@ from spacy.lexeme cimport Lexeme
 
 from cython.operator cimport dereference as deref
 from spacy.spacy cimport Language
-from spacy.lexeme cimport StringAttr
 
 
 cdef class Tokens:
@@ -15,5 +14,5 @@ cdef class Tokens:
     cpdef int append(self, Lexeme_addr token)
     cpdef int extend(self, Tokens other) except -1
 
-    cpdef object group_by(self, StringAttr attr)
-    cpdef dict count_by(self, StringAttr attr)
+    cpdef object group_by(self, size_t attr)
+    cpdef dict count_by(self, size_t attr)

spacy/tokens.pyx

@@ -3,7 +3,7 @@ from cython.operator cimport preincrement as inc
 
 
 from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport attr_of, lex_of, norm_of, shape_of
+#from spacy.lexeme cimport attr_of, lex_of, norm_of, shape_of
 from spacy.spacy cimport StringHash
 
 
@@ -37,7 +37,7 @@ cdef class Tokens:
         for el in other:
             self.append(el)
 
-    cpdef object group_by(self, StringAttr attr):
+    cpdef object group_by(self, size_t attr):
         '''Group tokens that share the property attr into Tokens instances, and
         return a list of them. Returns a tuple of three lists:
 
@@ -66,7 +66,8 @@ cdef class Tokens:
         cdef StringHash key
         cdef Lexeme_addr t
         for t in self.vctr[0]:
-            key = attr_of(t, attr)
+            #key = attr_of(t, attr)
+            key = 0
             if key in indices:
                 groups[indices[key]].append(t)
             else:
@@ -77,12 +78,13 @@ cdef class Tokens:
                 groups[-1].append(t)
         return names, hashes, groups
 
-    cpdef dict count_by(self, StringAttr attr):
+    cpdef dict count_by(self, size_t attr):
         counts = {}
         cdef Lexeme_addr t
         cdef StringHash key
         for t in self.vctr[0]:
-            key = attr_of(t, attr)
+            #key = attr_of(t, attr)
+            key = 0
             if key not in counts:
                 counts[key] = 0
             counts[key] += 1
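
group_by and count_by aggregate tokens by a hashed string attribute: group_by returns three parallel lists (names, hashes, groups) and count_by maps each StringHash key to a token count. In this intermediate commit the attr_of call is commented out and the key is hard-coded to 0, so every token lands in one bucket. A hedged usage sketch, assuming the extension modules still compile in this state:

    from __future__ import unicode_literals

    from spacy import en

    tokens = en.tokenize(u'the cat sat on the mat')

    names, hashes, groups = tokens.group_by(0)   # the attr code is ignored while stubbed
    counts = tokens.count_by(0)
    print(counts)                                # e.g. {0: 6} with the key = 0 stub in place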