Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-26 09:56:28 +03:00)

Commit a78ad4152d (parent 5fddb8d165)

* Broken version being refactored for docs
spacy/en.pxd

@@ -10,6 +10,7 @@ from spacy.tokens cimport Tokens

 cdef class English(spacy.Language):
     cdef int find_split(self, unicode word)
+    cdef int set_orth(self, unicode word, Lexeme* lex) except -1

 cdef English EN

66 spacy/en.pyx
@@ -1,7 +1,8 @@
 # cython: profile=True
-'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
-so that strings can be retrieved from hashes. Use 64-bit hash values and
-boldly assume no collisions.
+# cython: embedsignature=True
+'''Tokenize English text, allowing some differences from the Penn Treebank
+tokenization, e.g. for email addresses, URLs, etc. Use en_ptb if full PTB
+compatibility is the priority.
 '''
 from __future__ import unicode_literals

@@ -9,14 +10,17 @@ from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector

-from spacy.string_tools cimport substr

-from . import util

 cimport spacy

+from spacy.orthography.latin cimport *


 cdef class English(spacy.Language):
+    cdef int set_orth(self, unicode word, Lexeme* lex) except -1:
+        pass
+
     cdef int find_split(self, unicode word):
         cdef size_t length = len(word)
         cdef int i = 0
@@ -26,17 +30,17 @@ cdef class English(spacy.Language):
         if word.endswith("'s") and length >= 3:
             return length - 2
         # Leading punctuation
-        if is_punct(word, 0, length):
+        if check_punct(word, 0, length):
             return 1
         elif length >= 1:
             # Split off all trailing punctuation characters
             i = 0
-            while i < length and not is_punct(word, i, length):
+            while i < length and not check_punct(word, i, length):
                 i += 1
         return i


-cdef bint is_punct(unicode word, size_t i, size_t length):
+cdef bint check_punct(unicode word, size_t i, size_t length):
     # Don't count appostrophes as punct if the next char is a letter
     if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
         return i == 0
@@ -55,14 +59,52 @@ EN = English('en')


 cpdef Tokens tokenize(unicode string):
+    """Tokenize a string.
+
+    Wraps EN.tokenize, where EN is an instance of the class English. The global
+    variable manages the vocabulary, and memoizes tokenization rules.
+
+    Args:
+        string (unicode): The string to be split. Must be unicode, not bytes.
+
+    Returns:
+        tokens (Tokens): A Tokens instance, managing a vector of pointers to
+            Lexeme structs. The Tokens instance supports sequence interfaces,
+            but also offers a range of sequence-level operations, which are computed
+            efficiently in Cython-space.
+    """
     return EN.tokenize(string)


 cpdef Lexeme_addr lookup(unicode string) except 0:
+    """Retrieve (or create) a Lexeme for a string.
+
+    Returns a Lexeme ID, which can be used via the accessor
+    methods in spacy.lexeme
+
+    Args:
+        string (unicode): The string to be looked up. Must be unicode, not bytes.
+
+    Returns:
+        LexemeID (size_t): An unsigned integer that allows the Lexeme to be retrieved.
+            The LexemeID is really a memory address, making dereferencing it essentially
+            free.
+    """
     return <Lexeme_addr>EN.lookup(string)


 cpdef unicode unhash(StringHash hash_value):
+    """Retrieve a string from a hash value. Mostly used for testing.
+
+    In general you should avoid computing with strings, as they are slower than
+    the intended ID-based usage. However, strings can be recovered if necessary,
+    although no control is taken for hash collisions.
+
+    Args:
+        hash_value (uint32_t): The hash of a string, returned by Python's hash()
+            function.
+
+    Returns:
+        string (unicode): A unicode string that hashes to the hash_value.
+    """
     return EN.unhash(hash_value)

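As a reading aid: the docstrings added above describe a workflow in which a string is interned once, callers hold an integer ID (really a struct address), and a reverse index ("bacov" in the Cython code) turns hashes back into strings. Below is a minimal pure-Python sketch of that pattern; the MiniVocab class is invented for illustration, and the real functions return addresses of C Lexeme structs rather than dictionary keys.

# Minimal pure-Python sketch of the lookup/unhash pattern described above.
# MiniVocab and its fields are invented for this illustration only.
class MiniVocab(object):
    def __init__(self):
        self._vocab = {}   # hash -> lexeme record
        self._bacov = {}   # hash -> original string (reverse index)

    def lookup(self, string):
        key = hash(string)
        if key not in self._vocab:
            self._vocab[key] = {'lex': key, 'length': len(string)}
            self._bacov[key] = string
        return key

    def unhash(self, key):
        return self._bacov[key]


vocab = MiniVocab()
lex_id = vocab.lookup(u'Hello')
assert vocab.unhash(lex_id) == u'Hello'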
spacy/lexeme.pxd

@@ -1,83 +1,34 @@
 from libc.stdint cimport uint32_t
 from libc.stdint cimport uint64_t

-# Put these above import to avoid circular import problem
 ctypedef int ClusterID
 ctypedef uint32_t StringHash
-ctypedef size_t Lexeme_addr
-ctypedef char Bits8
-ctypedef uint64_t Bits64
+ctypedef size_t LexID
+ctypedef char OrthFlags
+ctypedef char DistFlags
+ctypedef uint64_t TagFlags


-cdef enum OrthFlag:
-    IS_ALPHA
-    IS_DIGIT
-    IS_PUNCT
-    IS_WHITE
-    IS_LOWER
-    IS_UPPER
-    IS_TITLE
-    IS_ASCII
-
-
-cdef enum DistFlag:
-    OFT_UPPER
-    OFT_TITLE
-    DIST_FLAG3
-    DIST_FLAG4
-    DIST_FLAG5
-    DIST_FLAG6
-    DIST_FLAG7
-    DIST_FLAG8
-
-
-cdef struct Orthography:
-    StringHash shape
-    StringHash norm
-    StringHash last3
-    Bits8 flags
-
-
-cdef struct Distribution:
-    double prob
-    ClusterID cluster
-    Bits64 tagdict
-    Bits8 flags
-

 cdef struct Lexeme:
+    StringHash lex
     char* string
     size_t length
-    StringHash lex
-    Orthography orth    # Extra orthographic views
-    Distribution dist   # Distribution info
+    double prob
+    ClusterID cluster
+    TagFlags possible_tags
+    DistFlags dist_flags
+    OrthFlags orth_flags
+    StringHash* string_views


-cdef Lexeme BLANK_WORD = Lexeme(NULL, 0, 0,
-                                Orthography(0, 0, 0, 0),
-                                Distribution(0.0, 0, 0, 0)
-                                )
+cpdef char first_of(LexID lex_id) except 0
+cpdef size_t length_of(LexID lex_id) except 0
+cpdef double prob_of(LexID lex_id) except 0
+cpdef ClusterID cluster_of(LexID lex_id) except 0

+cpdef bint check_tag_flag(LexID lex, TagFlags flag) except *
+cpdef bint check_dist_flag(LexID lex, DistFlags flag) except *
+cpdef bint check_orth_flag(LexID lex, OrthFlags flag) except *

-cdef enum StringAttr:
-    LEX
-    NORM
-    SHAPE
-    LAST3
-    LENGTH
-
-
-cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0
-
-cpdef StringHash lex_of(size_t lex_id) except 0
-cpdef StringHash norm_of(size_t lex_id) except 0
-cpdef StringHash shape_of(size_t lex_id) except 0
-cpdef StringHash last3_of(size_t lex_id) except 0
-
-cpdef size_t length_of(size_t lex_id) except *
-
-cpdef double prob_of(size_t lex_id) except 0
-cpdef ClusterID cluster_of(size_t lex_id) except 0
-
-cpdef bint check_orth_flag(size_t lex, OrthFlag flag) except *
-cpdef bint check_dist_flag(size_t lex, DistFlag flag) except *
+cpdef StringHash view_of(LexID lex_id, size_t view) except 0
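The new OrthFlags, DistFlags, and TagFlags typedefs replace the enums and nested structs with flat bitfields on Lexeme. The arithmetic is the same everywhere: set a flag with |= (1 << flag) and test it with & (1 << flag), as set_flags and the check_*_flag helpers do. A self-contained Python sketch of that scheme follows; the flag names and bit positions are illustrative only, not the library's actual values.

# Illustrative bit-flag scheme, assuming made-up flag positions.
IS_ALPHA, IS_DIGIT, IS_PUNCT, IS_UPPER = 0, 1, 2, 3

def set_flags(flags, active_flags):
    # Set each requested bit, mirroring set_flags in lexeme.pyx.
    for flag in active_flags:
        flags |= 1 << flag
    return flags

def check_flag(flags, flag):
    # Test a single bit, mirroring check_orth_flag / check_dist_flag.
    return bool(flags & (1 << flag))

orth_flags = set_flags(0, [IS_ALPHA, IS_UPPER])
assert check_flag(orth_flags, IS_ALPHA)
assert not check_flag(orth_flags, IS_DIGIT)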
spacy/lexeme.pyx

@@ -1,32 +1,32 @@
 # cython: profile=True
+# cython: embedsignature=True
 '''Accessors for Lexeme properties, given a lex_id, which is cast to a Lexeme*.
 Mostly useful from Python-space. From Cython-space, you can just cast to
 Lexeme* yourself.
 '''
 from __future__ import unicode_literals

-from spacy.string_tools cimport substr

 from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
-from libcpp.vector cimport vector

 from spacy.spacy cimport StringHash


-cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
-    if attr == LEX:
-        return lex_of(lex_id)
-    elif attr == NORM:
-        return norm_of(lex_id)
-    elif attr == SHAPE:
-        return shape_of(lex_id)
-    elif attr == LAST3:
-        return last3_of(lex_id)
-    elif attr == LENGTH:
-        return length_of(lex_id)
-    else:
-        raise StandardError
+cpdef int set_flags(LexID lex_id, object active_flags) except *:
+    """Set orthographic bit flags for a Lexeme.
+
+    Args:
+        lex_id (LexemeID): A reference ID for a Lexeme.
+        active_flags: A sequence of bits to set as True.
+    """
+    cdef size_t flag
+    cdef Lexeme* w = <Lexeme*>lex_id
+    for flag in active_flags:
+        w.orth_flags |= 1 << flag
+
+
+cpdef StringHash view_of(LexID lex_id, size_t view) except 0:
+    return (<Lexeme*>lex_id).string_views[view]


 cpdef StringHash lex_of(size_t lex_id) except 0:
@@ -37,42 +37,14 @@ cpdef StringHash lex_of(size_t lex_id) except 0:
     delimited tokens split off. The other fields refer to properties of the
     string that the lex field stores a hash of, except sic and tail.

-    >>> [unhash(lex_of(lex_id) for lex_id in from_string(u'Hi! world')]
+    >>> from spacy import en
+    >>> [en.unhash(lex_of(lex_id) for lex_id in en.tokenize(u'Hi! world')]
     [u'Hi', u'!', u'world']
     '''
     return (<Lexeme*>lex_id).lex


-cpdef StringHash norm_of(size_t lex_id) except 0:
-    '''Access the `lex' field of the Lexeme pointed to by lex_id.
-
-    The lex field is the hash of the string you would expect to get back from
-    a standard tokenizer, i.e. the word with punctuation and other non-whitespace
-    delimited tokens split off. The other fields refer to properties of the
-    string that the lex field stores a hash of, except sic and tail.
-
-    >>> [unhash(lex_of(lex_id) for lex_id in from_string(u'Hi! world')]
-    [u'Hi', u'!', u'world']
-    '''
-    return (<Lexeme*>lex_id).orth.norm
-
-
-cpdef StringHash shape_of(size_t lex_id) except 0:
-    return (<Lexeme*>lex_id).orth.shape
-
-
-cpdef StringHash last3_of(size_t lex_id) except 0:
-    '''Access the `last3' field of the Lexeme pointed to by lex_id, which stores
-    the hash of the last three characters of the word:
-
-    >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
-    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
-    [u'llo', u'!']
-    '''
-    return (<Lexeme*>lex_id).orth.last3
-
-
-cpdef ClusterID cluster_of(size_t lex_id) except 0:
+cpdef ClusterID cluster_of(LexID lex_id) except 0:
     '''Access the `cluster' field of the Lexeme pointed to by lex_id, which
     gives an integer representation of the cluster ID of the word,
     which should be understood as a binary address:
@@ -88,10 +60,10 @@ cpdef ClusterID cluster_of(size_t lex_id) except 0:
     while "dapple" is totally different. On the other hand, "scalable" receives
     the same cluster ID as "pineapple", which is not what we'd like.
     '''
-    return (<Lexeme*>lex_id).dist.cluster
+    return (<Lexeme*>lex_id).cluster


-cpdef Py_UNICODE first_of(size_t lex_id):
+cpdef char first_of(size_t lex_id) except 0:
     '''Access the `first' field of the Lexeme pointed to by lex_id, which
     stores the first character of the lex string of the word.

@@ -99,10 +71,10 @@ cpdef Py_UNICODE first_of(size_t lex_id):
     >>> unhash(first_of(lex_id))
     u'H'
     '''
-    return (<Lexeme*>lex_id).orth.first
+    return (<Lexeme*>lex_id).string[0]


-cpdef size_t length_of(size_t lex_id) except *:
+cpdef size_t length_of(size_t lex_id) except 0:
     '''Access the `length' field of the Lexeme pointed to by lex_id, which stores
     the length of the string hashed by lex_of.'''
     cdef Lexeme* word = <Lexeme*>lex_id
@@ -119,8 +91,10 @@ cpdef double prob_of(size_t lex_id) except 0:
     >>> prob_of(lookup(u'world'))
     -20.10340371976182
     '''
-    return (<Lexeme*>lex_id).dist.prob
+    return (<Lexeme*>lex_id).prob

+DEF OFT_UPPER = 1
+DEF OFT_TITLE = 2

 cpdef bint is_oft_upper(size_t lex_id):
     '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
@@ -134,7 +108,7 @@ cpdef bint is_oft_upper(size_t lex_id):
     >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
     True
     '''
-    return (<Lexeme*>lex_id).dist.flags & OFT_UPPER
+    return (<Lexeme*>lex_id).dist_flags & (1 << OFT_UPPER)


 cpdef bint is_oft_title(size_t lex_id):
@@ -149,11 +123,15 @@ cpdef bint is_oft_title(size_t lex_id):
     >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
     True
     '''
-    return (<Lexeme*>lex_id).dist.flags & OFT_TITLE
+    return (<Lexeme*>lex_id).dist_flags & (1 << OFT_TITLE)

-cpdef bint check_orth_flag(size_t lex_id, OrthFlag flag) except *:
-    return (<Lexeme*>lex_id).orth.flags & (1 << flag)
+cpdef bint check_orth_flag(size_t lex_id, OrthFlags flag) except *:
+    return (<Lexeme*>lex_id).orth_flags & (1 << flag)


-cpdef bint check_dist_flag(size_t lex_id, DistFlag flag) except *:
-    return (<Lexeme*>lex_id).dist.flags & (1 << flag)
+cpdef bint check_dist_flag(size_t lex_id, DistFlags flag) except *:
+    return (<Lexeme*>lex_id).dist_flags & (1 << flag)


+cpdef bint check_tag_flag(LexID lex_id, TagFlags flag) except *:
+    return (<Lexeme*>lex_id).possible_tags & (1 << flag)
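The cluster_of docstring describes cluster IDs as binary addresses: words whose IDs share a long bit prefix are distributionally similar. A rough Python illustration of that idea follows; the cluster values are made up for the example and are not real Brown-cluster IDs.

# Illustrative only: made-up cluster IDs, compared by shared bit prefix.
def shared_prefix_len(a, b, width=16):
    a_bits = format(a, '0%db' % width)
    b_bits = format(b, '0%db' % width)
    n = 0
    for x, y in zip(a_bits, b_bits):
        if x != y:
            break
        n += 1
    return n

pineapple = 0b1110110101101110
apple     = 0b1110110101101100
dapple    = 0b0101010001010000
# Similar words share a long prefix; unrelated words diverge immediately.
assert shared_prefix_len(pineapple, apple) > shared_prefix_len(pineapple, dapple)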
spacy/spacy.pxd

@@ -19,8 +19,6 @@ ctypedef int ClusterID


 from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport Distribution
-from spacy.lexeme cimport Orthography


 cdef class Language:
@@ -29,7 +27,7 @@ cdef class Language:
     cdef dense_hash_map[StringHash, size_t] vocab
     cdef dict bacov

-    cdef Tokens tokenize(self, unicode text)
+    cpdef Tokens tokenize(self, unicode text)

     cdef Lexeme* lookup(self, unicode string) except NULL
     cdef Lexeme** lookup_chunk(self, unicode chunk) except NULL
@@ -37,7 +35,8 @@ cdef class Language:
     cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL
     cdef Lexeme* new_lexeme(self, unicode lex) except NULL

-    cdef unicode unhash(self, StringHash hashed)
+    cpdef unicode unhash(self, StringHash hashed)

-    cpdef list find_substrings(self, unicode word)
+    cpdef list find_substrings(self, unicode chunk)
     cdef int find_split(self, unicode word)
+    cdef int set_orth(self, unicode string, Lexeme* word)
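These declarations sketch the division of labour: the base Language class drives chunk splitting through find_substrings, while a language-specific subclass such as English overrides find_split (and, with this commit, set_orth). A pure-Python approximation of that pattern is shown below; the splitting rules are toy assumptions and do not reproduce the library's behaviour.

# Toy base-class/subclass split, assuming invented rules.
class Language(object):
    def find_split(self, word):
        return len(word)  # default: never split the chunk

    def find_substrings(self, chunk):
        # Generic driver: repeatedly slice off the prefix chosen by find_split.
        substrings = []
        while chunk:
            split = self.find_split(chunk)
            if split == 0:
                substrings.append(chunk)
                break
            substrings.append(chunk[:split])
            chunk = chunk[split:]
        return substrings


class English(Language):
    def find_split(self, word):
        # Toy rules: split off a leading punctuation character on its own,
        # otherwise split before the first trailing punctuation character.
        if not word[0].isalnum():
            return 1
        i = 0
        while i < len(word) and word[i].isalnum():
            i += 1
        return i


print(English().find_substrings(u"hello!!"))  # [u'hello', u'!', u'!']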
144 spacy/spacy.pyx
@@ -1,4 +1,13 @@
 # cython: profile=True
+# cython: embedsignature=True
+"""Common classes and utilities across languages.
+
+Provides the main implementation for the spacy tokenizer. Specific languages
+subclass the Language class, over-writing the tokenization rules as necessary.
+Special-case tokenization rules are read from data/<lang>/tokenization .
+"""
+

 from __future__ import unicode_literals

 from libc.stdlib cimport calloc, free
@@ -6,54 +15,13 @@ from libcpp.pair cimport pair
 from cython.operator cimport dereference as deref

 from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport BLANK_WORD
+from spacy.lexeme cimport LexID

-from spacy.string_tools cimport substr

 from . import util
 from os import path

-DIST_FLAGS = {}
 TAGS = {}
+DIST_FLAGS = {}

-def get_normalized(unicode lex):
-    if lex.isalpha() and lex.islower():
-        return lex
-    else:
-        return get_word_shape(lex)
-
-
-def get_word_shape(unicode lex):
-    cdef size_t length = len(lex)
-    shape = ""
-    last = ""
-    shape_char = ""
-    seq = 0
-    for c in lex:
-        if c.isalpha():
-            if c.isupper():
-                shape_char = "X"
-            else:
-                shape_char = "x"
-        elif c.isdigit():
-            shape_char = "d"
-        else:
-            shape_char = c
-        if shape_char == last:
-            seq += 1
-        else:
-            seq = 0
-        last = shape_char
-        if seq < 3:
-            shape += shape_char
-    assert shape
-    return shape
-
-
-def set_orth_flags(lex):
-    return 0


 cdef class Language:
     def __cinit__(self, name):
@@ -64,9 +32,19 @@ cdef class Language:
         self.chunks.set_empty_key(0)
         self.vocab.set_empty_key(0)
         self.load_tokenization(util.read_tokenization(name))
-        #self.load_dist_info(util.read_dist_info(name))
+        self.load_dist_info(util.read_dist_info(name))

-    cdef Tokens tokenize(self, unicode string):
+    cpdef Tokens tokenize(self, unicode string):
+        """Tokenize.
+
+        Split the string into tokens.
+
+        Args:
+            string (unicode): The string to split.
+
+        Returns:
+            tokens (Tokens): A Tokens object.
+        """
         cdef Lexeme** chunk
         cdef Tokens tokens = Tokens(self)
         cdef size_t length = len(string)
@@ -85,8 +63,7 @@ cdef class Language:
         return tokens

     cdef Lexeme* lookup(self, unicode string) except NULL:
-        if len(string) == 0:
-            return &BLANK_WORD
+        assert len(string) != 0
         cdef Lexeme* word = <Lexeme*>self.vocab[hash(string)]
         if word == NULL:
             word = self.new_lexeme(string)
@@ -113,56 +90,79 @@ cdef class Language:
         cdef bytes byte_string = string.encode('utf8')
         word.string = <char*>byte_string
         word.length = len(byte_string)
-        word.orth.flags = set_orth_flags(string)
-        cdef unicode norm = get_normalized(string)
-        cdef unicode shape = get_word_shape(string)
-        cdef unicode last3 = string[-3:]
-        word.lex = hash(string)
-        word.orth.norm = hash(norm)
-        word.orth.shape = hash(shape)
-        word.orth.last3 = hash(last3)
-        self.bacov[word.lex] = string
-        self.bacov[word.orth.norm] = norm
-        self.bacov[word.orth.shape] = shape
-        self.bacov[word.orth.last3] = last3
+        self.set_orth(string, word)

-        self.vocab[hash(string)] = <size_t>word
+        word.lex = hash(string)
+        self.bacov[word.lex] = string
+        self.vocab[word.lex] = <LexID>word
         return word

-    cdef unicode unhash(self, StringHash hash_value):
+    cpdef unicode unhash(self, StringHash hash_value):
         '''Fetch a string from the reverse index, given its hash value.'''
         return self.bacov[hash_value]

-    cpdef list find_substrings(self, unicode word):
+    cpdef list find_substrings(self, unicode chunk):
+        """Find how to split a chunk into substrings.
+
+        This method calls find_split repeatedly. Most languages will want to
+        override find_split, but it may be useful to override this instead.
+
+        Args:
+            chunk (unicode): The string to be split, e.g. u"Mike's!"
+
+        Returns:
+            substrings (list): The component substrings, e.g. [u"Mike", "'s", "!"].
+        """
         substrings = []
-        while word:
-            split = self.find_split(word)
+        while chunk:
+            split = self.find_split(chunk)
             if split == 0:
-                substrings.append(word)
+                substrings.append(chunk)
                 break
-            substrings.append(word[:split])
-            word = word[split:]
+            substrings.append(chunk[:split])
+            chunk = chunk[split:]
         return substrings

     cdef int find_split(self, unicode word):
         return len(word)

-    def load_tokenization(self, token_rules=None):
+    cdef int set_orth(self, unicode string, Lexeme* word):
+        pass
+
+    def load_tokenization(self, token_rules):
+        '''Load special-case tokenization rules.
+
+        Loads special-case tokenization rules into the Language.chunk cache,
+        read from data/<lang>/tokenization . The special cases are loaded before
+        any language data is tokenized, giving these priority. For instance,
+        the English tokenization rules map "ain't" to ["are", "not"].
+
+        Args:
+            token_rules (list): A list of (chunk, tokens) pairs, where chunk is
+                a string and tokens is a list of strings.
+        '''
         for chunk, tokens in token_rules:
             self.new_chunk(chunk, tokens)

     def load_dist_info(self, dist_info):
+        '''Load distributional information for the known lexemes of the language.
+
+        The distributional information is read from data/<lang>/dist_info.json .
+        It contains information like the (smoothed) unigram log probability of
+        the word, how often the word is found upper-cased, how often the word
+        is found title-cased, etc.
+        '''
         cdef unicode string
         cdef dict word_dist
         cdef Lexeme* w
         for string, word_dist in dist_info.items():
             w = self.lookup(string)
-            w.dist.prob = word_dist.prob
-            w.dist.cluster = word_dist.cluster
+            w.prob = word_dist.prob
+            w.cluster = word_dist.cluster
             for flag in word_dist.flags:
-                w.dist.flags |= DIST_FLAGS[flag]
+                w.dist_flags |= DIST_FLAGS[flag]
             for tag in word_dist.tagdict:
-                w.dist.tagdict |= TAGS[tag]
+                w.possible_tags |= TAGS[tag]


 cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
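The load_tokenization docstring above describes a special-case cache that is consulted before the rule-based splitter, for instance mapping "ain't" to ["are", "not"]. A small pure-Python sketch of that lookup-before-split flow follows; the fallback rule and the example entries are invented for illustration.

# Toy special-case flow, assuming invented rules and data.
SPECIAL_CASES = {u"ain't": [u"are", u"not"], u"won't": [u"will", u"not"]}

def split_chunk(chunk):
    # Special cases take priority over the rule-based splitter.
    if chunk in SPECIAL_CASES:
        return list(SPECIAL_CASES[chunk])
    # Fallback toy rule: split off a single trailing punctuation character.
    if len(chunk) > 1 and not chunk[-1].isalnum():
        return [chunk[:-1], chunk[-1]]
    return [chunk]

def tokenize(string):
    tokens = []
    for chunk in string.split():
        tokens.extend(split_chunk(chunk))
    return tokens

print(tokenize(u"I ain't done!"))  # [u'I', u'are', u'not', u'done', u'!']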
spacy/tokens.pxd

@@ -4,7 +4,6 @@ from spacy.lexeme cimport Lexeme

 from cython.operator cimport dereference as deref
 from spacy.spacy cimport Language
-from spacy.lexeme cimport StringAttr


 cdef class Tokens:
@@ -15,5 +14,5 @@ cdef class Tokens:
     cpdef int append(self, Lexeme_addr token)
     cpdef int extend(self, Tokens other) except -1

-    cpdef object group_by(self, StringAttr attr)
-    cpdef dict count_by(self, StringAttr attr)
+    cpdef object group_by(self, size_t attr)
+    cpdef dict count_by(self, size_t attr)
spacy/tokens.pyx

@@ -3,7 +3,7 @@ from cython.operator cimport preincrement as inc


 from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport attr_of, lex_of, norm_of, shape_of
+#from spacy.lexeme cimport attr_of, lex_of, norm_of, shape_of
 from spacy.spacy cimport StringHash


@@ -37,7 +37,7 @@ cdef class Tokens:
         for el in other:
             self.append(el)

-    cpdef object group_by(self, StringAttr attr):
+    cpdef object group_by(self, size_t attr):
         '''Group tokens that share the property attr into Tokens instances, and
         return a list of them. Returns a tuple of three lists:

@@ -66,7 +66,8 @@ cdef class Tokens:
         cdef StringHash key
         cdef Lexeme_addr t
         for t in self.vctr[0]:
-            key = attr_of(t, attr)
+            #key = attr_of(t, attr)
+            key = 0
             if key in indices:
                 groups[indices[key]].append(t)
             else:
@@ -77,12 +78,13 @@ cdef class Tokens:
                 groups[-1].append(t)
         return names, hashes, groups

-    cpdef dict count_by(self, StringAttr attr):
+    cpdef dict count_by(self, size_t attr):
         counts = {}
         cdef Lexeme_addr t
         cdef StringHash key
         for t in self.vctr[0]:
-            key = attr_of(t, attr)
+            #key = attr_of(t, attr)
+            key = 0
             if key not in counts:
                 counts[key] = 0
             counts[key] += 1
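For orientation, count_by buckets tokens by the hash of some attribute and counts occurrences; group_by does the same but collects the tokens into groups. A pure-Python sketch of the counting half follows, with plain strings standing in for Lexeme addresses and Python's hash() standing in for the attr_of accessor that this commit comments out.

# Illustrative count_by, assuming strings in place of Lexeme addresses.
def count_by(tokens, attr_of):
    counts = {}
    for t in tokens:
        key = attr_of(t)
        counts[key] = counts.get(key, 0) + 1
    return counts

words = [u'the', u'cat', u'sat', u'on', u'the', u'mat']
counts = count_by(words, attr_of=hash)
assert counts[hash(u'the')] == 2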