* Reorganized, moving language-independent code into spacy. The functions in spacy take the dictionaries and the split function as arguments, while the language-specific modules provide curried versions that use their module-level globals.

2025-10-30 23:47:31 +03:00 · 2014-07-07 04:21:06 +02:00 · 2014-07-07 04:21:06 +02:00 · d5bef02c72
commit d5bef02c72
parent a62c38e1ef
14 changed files with 466242 additions and 3689 deletions
--- a/data/en/case
+++ b/data/en/case
--- a/data/en/clusters
+++ b/data/en/clusters
--- a/data/en/tokenization
+++ b/data/en/tokenization
@ -0,0 +1,93 @@
+# https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
+#  21:09, 25 June 2014
+#*--*  --
+#*---* ---
+#*'s  's
+
+ain't   are not
+aren't  are not
+can't   can not
+could've    could have
+couldn't    could not
+couldn't've could not have
+didn't  did not
+doesn't does not
+don't   do not
+hadn't  had not
+hadn't've   had not have
+hasn't  has not
+haven't have not
+he'd    he would
+he'd've he would have
+he'll   he will
+he's    he 's
+how'd   how would
+how'll  how will
+how's   how 's
+I'd I would
+I'd've  I would have
+I'll    I will
+I'm I am
+I've    I have
+isn't   is not
+it'd    it would
+it'd've it would have
+it'll   it will
+it's    it 's
+let's   let 's
+mightn't    might not
+mightn't've might not have
+might've    might have
+mustn't must not
+must've must have
+needn't need not
+not've  not have
+shan't  shall not
+she'd   she would
+she'd've    she would have
+she'll  she will
+she's   she 's
+should've   should have
+shouldn't   should not
+shouldn't've    should not have
+that's  that 's
+there'd there would
+there'd've  there would have
+there's there is
+they'd  they would
+they'd've   they would have
+they'll they will
+they're they are
+they've they have
+wasn't  was not
+we'd    we would
+we'd've we would have
+we'll   we will
+we're   we are
+we've   we have
+weren't were not
+what'll what will
+what're what are
+what's  what 's
+what've what have
+when's  when 's
+where'd where would
+where's where 's
+where've    where have
+who'd   who would
+who'll  who will
+who're  who are
+who's   who 's
+who've  who have
+why'll  why will
+why're  why are
+why's   why is
+won't   will not
+would've    would have
+wouldn't    would not
+wouldn't've would not have
+you'd   you would
+you'd've    you would have
+you'll  you will
+you're  you are
+you've  you have
--- a/spacy/en.cpp
+++ b/spacy/en.cpp
--- a/spacy/en.pxd
+++ b/spacy/en.pxd
@ -1,17 +1,15 @@
-from ext.sparsehash cimport dense_hash_map
-from spacy.lexeme cimport StringHash
+from libcpp.vector cimport vector
+
+from spacy.spacy cimport StringHash
+from spacy.spacy cimport Vocab
 from spacy.lexeme cimport Lexeme
+from spacy.lexeme cimport Lexeme_addr


-ctypedef Py_UNICODE* string_ptr
-ctypedef size_t Lexeme_addr # For python interop 
-ctypedef Lexeme* Lexeme_ptr
-
-
-cdef dense_hash_map[StringHash, Lexeme_ptr] LEXEMES
+cdef Vocab VOCAB
+cdef dict BACOV


 cpdef Lexeme_addr lookup(unicode word) except 0
-cpdef Lexeme_addr lookup_chunk(unicode chunk, int start, int end) except 0
-cdef StringHash hash_string(unicode s, size_t length) except 0
+cpdef vector[Lexeme_addr] tokenize(unicode string) except *
 cpdef unicode unhash(StringHash hash_value)
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@ -9,211 +9,43 @@ from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector

 from spacy.lexeme cimport Lexeme
-from ext.murmurhash cimport MurmurHash64A
-from ext.murmurhash cimport MurmurHash64B
+from spacy.string_tools cimport substr
 from . import util

+cimport spacy

-STRINGS = {}
-LEXEMES = dense_hash_map[StringHash, Lexeme_ptr]()
-LEXEMES.set_empty_key(0)
+BACOV = {}
+VOCAB = Vocab()
+VOCAB.set_empty_key(0)


-cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)
-
-
-def load_tokenization(token_rules):
-    cdef Lexeme* word
-    cdef StringHash hashed
-    for chunk, lex, tokens in token_rules:
-        hashed = hash_string(chunk, len(chunk))
-        assert LEXEMES[hashed] == NULL
-        word = _add(hashed, lex, len(lex), len(lex))
-        for i, lex in enumerate(tokens):
-            token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
-            length = len(token_string)
-            hashed = hash_string(token_string, length)
-            word.tail = _add(hashed, lex, 0, len(lex))
-            word = word.tail
-
-
-load_tokenization(util.read_tokenization('en'))
+spacy.load_tokenization(VOCAB, BACOV, util.read_tokenization('en'))


 cpdef vector[Lexeme_addr] tokenize(unicode string) except *:
-    cdef size_t length = len(string)
-    cdef Py_UNICODE* characters = <Py_UNICODE*>string
+    return spacy.tokenize(VOCAB, BACOV, find_split, string)
 
-    cdef size_t i
-    cdef Py_UNICODE c
-
-    cdef vector[Lexeme_addr] tokens = vector[Lexeme_addr]()
-    cdef unicode current = u''
-    cdef Lexeme* token
-    for i in range(length):
-        c = characters[i]
-        if is_whitespace(c):
-            if current:
-                token = <Lexeme*>lookup(current)
-                while token != NULL:
-                    tokens.push_back(<Lexeme_addr>token)
-                    token = token.tail
-            current = u''
-        else:
-            current += c
-    if current:
-        token = <Lexeme*>lookup(current)
-        while token != NULL:
-            tokens.push_back(<Lexeme_addr>token)
-            token = token.tail
-    return tokens
-
-cdef inline bint is_whitespace(Py_UNICODE c):
-    # TODO: Support other unicode spaces
-    # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
-    if c == u' ':
-        return True
-    elif c == u'\n':
-        return True
-    elif c == u'\t':
-        return True
-    else:
-        return False

 cpdef Lexeme_addr lookup(unicode string) except 0:
-    '''.. function:: enumerate(sequence[, start=0])
-    Fetch a Lexeme representing a word string. If the word has not been seen,
-    construct one, splitting off any attached punctuation or clitics.  A
-    reference to BLANK_WORD is returned for the empty string.
-    
-    To specify the boundaries of the word if it has not been seen, use lookup_chunk.
-    '''
-    if string == '':
-        return <Lexeme_addr>&BLANK_WORD
-    cdef size_t length = len(string)
-    cdef StringHash hashed = hash_string(string, length)
-    cdef Lexeme* word_ptr = LEXEMES[hashed]
-    cdef size_t n
-    if word_ptr == NULL:
-        word_ptr = _add(hashed, string, _find_split(string, length), length)
-    return <Lexeme_addr>word_ptr
-
-
-cpdef Lexeme_addr lookup_chunk(unicode string, int start, int end) except 0:
-    '''Fetch a Lexeme representing a word string. If the word has not been seen,
-    construct one, given the specified start and end indices.  A negative index
-    significes 0 for start, and the string length for end --- i.e. the string
-    will not be sliced if start == -1 and end == -1.
-    
-    A reference to BLANK_WORD is returned for the empty string.
-    '''
-    if string == '':
-        return <Lexeme_addr>&BLANK_WORD
-    cdef size_t length = len(string)
-    cdef StringHash hashed = hash_string(string, length)
-    cdef Lexeme* chunk_ptr = LEXEMES[hashed]
-    if chunk_ptr == NULL:
-        chunk_ptr = _add(hashed, string, start, length)
-    return <Lexeme_addr>chunk_ptr
-
-
-cdef StringHash hash_string(unicode s, size_t length) except 0:
-    '''Hash unicode with MurmurHash64A'''
-    assert length
-    return MurmurHash64A(<string_ptr>s, length * sizeof(Py_UNICODE), 0)
+    return spacy.lookup(VOCAB, BACOV, find_split, -1, string)


 cpdef unicode unhash(StringHash hash_value):
-    '''Fetch a string from the reverse index, given its hash value.'''
-    cdef string_ptr string = STRINGS[hash_value]
-    if string == NULL:
-        raise ValueError(hash_value)
-
-    return string
+    return spacy.unhash(BACOV, hash_value)


-cdef unicode normalize_word_string(unicode word):
-    '''Return a normalized version of the word, mapping:
-    - 4 digit strings into !YEAR
-    - Other digit strings into !DIGITS
-    - All other strings into lower-case
-    '''
+cdef vector[StringHash] make_string_views(unicode word):
    cdef unicode s
-    if word.isdigit() and len(word) == 4:
-        return '!YEAR'
-    elif word[0].isdigit():
-        return '!DIGITS'
-    else:
-        return word.lower()
+    return vector[StringHash]()
+    #if word.isdigit() and len(word) == 4:
+    #    return '!YEAR'
+    #elif word[0].isdigit():
+    #    return '!DIGITS'
+    #else:
+    #    return word.lower()
  

-cpdef unicode _substr(unicode string, int start, int end, size_t length):
-    if end >= length:
-        end = -1
-    if start >= length:
-        start = 0
-    if start <= 0 and end < 0:
-        return string
-    elif start < 0:
-        start = 0
-    elif end < 0:
-        end = length
-    return string[start:end]
-  
-
-cdef Lexeme* _add(StringHash hashed, unicode string, int split, size_t length) except NULL:
-    assert string
-    assert split <= length
-    word = _init_lexeme(string, hashed, split, length)
-    LEXEMES[hashed] = word
-    STRINGS[hashed] = string
-    return word
-
-
-cdef Lexeme* _init_lexeme(unicode string, StringHash hashed,
-                          int split, size_t length) except NULL:
-    assert split <= length
-    cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
-
-    word.first = <Py_UNICODE>(string[0] if string else 0)
-    word.sic = hashed
-    
-    cdef unicode tail_string
-    cdef unicode lex 
-    if split != 0 and split < length:
-        lex = _substr(string, 0, split, length)
-        tail_string = _substr(string, split, length, length)
-    else:
-        lex = string
-        tail_string = ''
-    assert lex
-    cdef unicode normed = normalize_word_string(lex)
-    cdef unicode last3 = _substr(string, length - 3, length, length)
-
-    assert normed
-    assert len(normed)
-    
-    word.lex = hash_string(lex, len(lex))
-    word.normed = hash_string(normed, len(normed))
-    word.last3 = hash_string(last3, len(last3))
-
-    STRINGS[word.lex] = lex
-    STRINGS[word.normed] = normed
-    STRINGS[word.last3] = last3
-
-    # These are loaded later
-    word.prob = 0
-    word.cluster = 0
-    word.oft_upper = False
-    word.oft_title = False
-    
-    # Now recurse, and deal with the tail
-    if tail_string:
-        word.tail = <Lexeme*>lookup(tail_string)
-    return word
-
-
-cdef size_t _find_split(unicode word, size_t length):
+cdef int find_split(unicode word, size_t length):
    cdef int i = 0
    # Contractions
    if word.endswith("'s"):
--- a/spacy/lexeme.cpp
+++ b/spacy/lexeme.cpp
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@ -1,9 +1,12 @@
 from libc.stdint cimport uint64_t

-
+# Put these above import to avoid circular import problem
 ctypedef int ClusterID
 ctypedef uint64_t StringHash
+ctypedef size_t Lexeme_addr

+from spacy.spacy cimport Vocab
+from spacy.spacy cimport Splitter

 cdef struct Lexeme:
    StringHash sic # Hash of the original string
@ -20,6 +23,12 @@ cdef struct Lexeme:
    Lexeme* tail # Lexemes are linked lists, to deal with sub-tokens


+cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)
+
+cdef Lexeme* init_lexeme(Vocab vocab, dict bacov, Splitter find_split,
+                         unicode string, StringHash hashed,
+                         int split, size_t length) except NULL
+ 
 # Use these to access the Lexeme fields via get_attr(Lexeme*, LexAttr), which
 # has a conditional to pick out the correct item.  This allows safe iteration
 # over the Lexeme, via:
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@ -2,6 +2,60 @@
 Mostly useful from Python-space. From Cython-space, you can just cast to
 Lexeme* yourself.
 '''
+from __future__ import unicode_literals
+
+from spacy.string_tools cimport substr
+from spacy.spacy cimport hash_string
+from spacy.spacy cimport lookup
+
+from libc.stdlib cimport malloc, calloc, free
+from libc.stdint cimport uint64_t
+from libcpp.vector cimport vector
+
+
+cdef Lexeme* init_lexeme(Vocab vocab, dict bacov, Splitter find_split,
+                         unicode string, StringHash hashed,
+                         int split, size_t length) except NULL:
+    assert split <= length
+    cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
+
+    word.first = <Py_UNICODE>(string[0] if string else 0)
+    word.sic = hashed
+    
+    cdef unicode tail_string
+    cdef unicode lex 
+    if split != 0 and split < length:
+        lex = substr(string, 0, split, length)
+        tail_string = substr(string, split, length, length)
+    else:
+        lex = string
+        tail_string = ''
+    assert lex
+    #cdef unicode normed = normalize_word_string(lex)
+    cdef unicode normed = '?'
+    cdef unicode last3 = substr(string, length - 3, length, length)
+
+    assert normed
+    assert len(normed)
+    
+    word.lex = hash_string(lex, len(lex))
+    word.normed = hash_string(normed, len(normed))
+    word.last3 = hash_string(last3, len(last3))
+
+    bacov[word.lex] = lex
+    bacov[word.normed] = normed
+    bacov[word.last3] = last3
+
+    # These are loaded later
+    word.prob = 0
+    word.cluster = 0
+    word.oft_upper = False
+    word.oft_title = False
+    
+    # Now recurse, and deal with the tail
+    if tail_string:
+        word.tail = <Lexeme*>lookup(vocab, bacov, find_split, -1, tail_string)
+    return word


 cpdef StringHash sic_of(size_t lex_id) except 0:
--- a/spacy/spacy.cpp
+++ b/spacy/spacy.cpp
--- a/spacy/spacy.pxd
+++ b/spacy/spacy.pxd
@ -1,5 +1,24 @@
 from libcpp.vector cimport vector
+from libc.stdint cimport uint64_t
+
+from ext.sparsehash cimport dense_hash_map
+
+# Circular import problems here
+ctypedef size_t Lexeme_addr
+ctypedef uint64_t StringHash
+ctypedef dense_hash_map[StringHash, Lexeme_addr] Vocab
+ctypedef int (*Splitter)(unicode word, size_t length)
+
+
 from spacy.lexeme cimport Lexeme

+cdef load_tokenization(Vocab& vocab, dict bacov, token_rules)
+cdef vector[Lexeme_addr] tokenize(Vocab& vocab, dict bacov, Splitter splitter,
+                                  unicode string) except *
+cdef Lexeme_addr lookup(Vocab& vocab, dict bacov, Splitter splitter, int start,
+                        unicode string) except 0
+cdef StringHash hash_string(unicode s, size_t length) except 0
+cdef unicode unhash(dict bacov, StringHash hash_value)
+ 
 
 cpdef vector[size_t] expand_chunk(size_t addr) except *
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@ -1,5 +1,78 @@
 from __future__ import unicode_literals
-from spacy.lexeme cimport Lexeme
+
+from ext.murmurhash cimport MurmurHash64A
+from ext.murmurhash cimport MurmurHash64B
+
+from spacy.lexeme cimport init_lexeme
+from spacy.lexeme cimport BLANK_WORD
+
+from spacy.string_tools cimport is_whitespace
+
+from . import util
+
+
+cdef load_tokenization(Vocab& vocab, dict bacov, token_rules):
+    cdef Lexeme* word
+    cdef StringHash hashed
+    for chunk, lex, tokens in token_rules:
+        hashed = hash_string(chunk, len(chunk))
+        assert vocab[hashed] == 0
+        word = _add(vocab, bacov, <Splitter>NULL, hashed, lex, len(lex), len(lex))
+        for i, lex in enumerate(tokens):
+            token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
+            length = len(token_string)
+            hashed = hash_string(token_string, length)
+            word.tail = _add(vocab, bacov, <Splitter>NULL, hashed, lex, 0, len(lex))
+            word = word.tail
+
+
+cdef vector[Lexeme_addr] tokenize(Vocab& vocab, dict bacov, Splitter splitter,
+                                  unicode string) except *:
+    cdef size_t length = len(string)
+    cdef Py_UNICODE* characters = <Py_UNICODE*>string
+
+    cdef size_t i
+    cdef Py_UNICODE c
+
+    cdef vector[Lexeme_addr] tokens = vector[Lexeme_addr]()
+    cdef unicode current = u''
+    cdef Lexeme* token
+    for i in range(length):
+        c = characters[i]
+        if is_whitespace(c):
+            if current:
+                token = <Lexeme*>lookup(vocab, bacov, splitter, -1, current)
+                while token != NULL:
+                    tokens.push_back(<Lexeme_addr>token)
+                    token = token.tail
+            current = u''
+        else:
+            current += c
+    if current:
+        token = <Lexeme*>lookup(vocab, bacov, splitter, -1, current)
+        while token != NULL:
+            tokens.push_back(<Lexeme_addr>token)
+            token = token.tail
+    return tokens
+
+
+cdef Lexeme_addr lookup(Vocab& vocab, dict bacov, Splitter find_split, int start,
+                        unicode string) except 0:
+    '''Fetch a Lexeme representing a word string. If the word has not been seen,
+    construct one, splitting off any attached punctuation or clitics.  A
+    reference to BLANK_WORD is returned for the empty string.
+    
+    To set the split point of an unseen word explicitly, pass it as start; when start is -1 the split is computed with find_split.
+    '''
+    if string == '':
+        return <Lexeme_addr>&BLANK_WORD
+    cdef size_t length = len(string)
+    cdef StringHash hashed = hash_string(string, length)
+    cdef Lexeme* word_ptr = <Lexeme*>vocab[hashed]
+    if word_ptr == NULL:
+        start = find_split(string, length) if start == -1 else start
+        word_ptr = _add(vocab, bacov, find_split, hashed, string, start, length)
+    return <Lexeme_addr>word_ptr


 cpdef vector[size_t] expand_chunk(size_t addr) except *:
@ -11,3 +84,22 @@ cpdef vector[size_t] expand_chunk(size_t addr) except *:
    return tokens


+cdef StringHash hash_string(unicode s, size_t length) except 0:
+    '''Hash unicode with MurmurHash64A'''
+    assert length
+    return MurmurHash64A(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0)
+
+
+cdef unicode unhash(dict bacov, StringHash hash_value):
+    '''Fetch a string from the reverse index, given its hash value.'''
+    return bacov[hash_value]
+
+
+cdef Lexeme* _add(Vocab& vocab, dict bacov, Splitter find_split, StringHash hashed,
+                  unicode string, int split, size_t length) except NULL:
+    assert string
+    assert split <= length
+    word = init_lexeme(vocab, bacov, find_split, string, hashed, split, length)
+    vocab[hashed] = <Lexeme_addr>word
+    bacov[hashed] = string
+    return word
--- a/spacy/string_tools.pxd
+++ b/spacy/string_tools.pxd
@ -0,0 +1,3 @@
+cpdef unicode substr(unicode string, int start, int end, size_t length)
+
+cdef bint is_whitespace(Py_UNICODE c)
--- a/spacy/string_tools.pyx
+++ b/spacy/string_tools.pyx
@ -0,0 +1,25 @@
+cpdef unicode substr(unicode string, int start, int end, size_t length):
+    if end >= length:
+        end = -1
+    if start >= length:
+        start = 0
+    if start <= 0 and end < 0:
+        return string
+    elif start < 0:
+        start = 0
+    elif end < 0:
+        end = length
+    return string[start:end]
+  
+
+cdef bint is_whitespace(Py_UNICODE c):
+    # TODO: Support other unicode spaces
+    # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
+    if c == u' ':
+        return True
+    elif c == u'\n':
+        return True
+    elif c == u'\t':
+        return True
+    else:
+        return False