* Reorganized, moving language-independent stuff to spacy. The functions in spacy ask for the dictionaries and split function on input, but the language-specific modules are curried versions that use the globals

This commit is contained in:
Matthew Honnibal 2014-07-07 04:21:06 +02:00
parent a62c38e1ef
commit d5bef02c72
14 changed files with 466242 additions and 3689 deletions
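
A minimal Python sketch of the pattern the commit message describes (illustration only; the real code below is Cython, and the generic_* and en_find_split names here are invented). The language-independent functions take the vocab, the reverse index (bacov) and the language's split function as arguments, while the language-specific module owns the globals and exposes curried wrappers:

from functools import partial

def generic_lookup(vocab, bacov, find_split, start, string):
    # Language-independent: everything it needs is passed in explicitly.
    key = hash(string)
    if key not in vocab:
        split = find_split(string, len(string)) if start == -1 else start
        word, tail = (string[:split], string[split:]) if 0 < split < len(string) else (string, '')
        vocab[key] = (word, tail)
        bacov[key] = string
    return vocab[key]

def generic_tokenize(vocab, bacov, find_split, string):
    return [generic_lookup(vocab, bacov, find_split, -1, chunk)
            for chunk in string.split()]

# Language-specific module: globals plus curried wrappers around the generics.
VOCAB, BACOV = {}, {}

def en_find_split(word, length):
    return length - 2 if word.endswith("'s") else 0

tokenize = partial(generic_tokenize, VOCAB, BACOV, en_find_split)
lookup = partial(generic_lookup, VOCAB, BACOV, en_find_split, -1)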

146129
data/en/case Normal file

File diff suppressed because it is too large.

316709
data/en/clusters Normal file

File diff suppressed because it is too large.

93
data/en/tokenization Normal file
View File

@@ -0,0 +1,93 @@
# https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
# 21:09, 25 June 2014
#*--* --
#*---* ---
#*'s 's
ain't are not
aren't are not
can't can not
could've could have
couldn't could not
couldn't've could not have
didn't did not
doesn't does not
don't do not
hadn't had not
hadn't've had not have
hasn't has not
haven't have not
he'd he would
he'd've he would have
he'll he will
he's he 's
how'd he would
how'll he will
how's how 's
I'd I would
I'd've I would have
I'll I will
I'm I am
I've I have
isn't is not
it'd it would
it'd've it would have
it'll it will
it's it 's
let's let 's
mightn't might not
mightn't've might not have
might've might have
mustn't must not
must've must have
needn't need not
not've not have
shan't shall not
she'd she would
she'd've she would have
she'll she will
she's she 's
should've should have
shouldn't should not
shouldn't've should not have
that's that 's
there'd there would
there'd've there would have
there's there is
they'd there would
they'd've they would have
they'll they will
they're they are
they've they have
wasn't was not
we'd we would
we'd've we would have
we'll we will
we're we are
we've we have
weren't were not
what'll what will
what're what are
what's what 's
what've what have
when's when 's
where'd where would
where's where 's
where've where have
who'd who would
who'll who will
who're who are
who's who 's
who've who have
why'll who will
why're why are
why's why is
won't will not
would've would have
wouldn't would not
wouldn't've would not have
you'd you would
you'd've you would have
you'll you will
you're you are
you've you have
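
The file above maps a surface chunk (first column) onto the tokens it should be split into (remaining columns); en.pyx loads it through util.read_tokenization('en'). A plausible reader, as a sketch (the real loader's output shape is not shown in this diff; load_tokenization() below unpacks each rule as chunk, first token, remaining tokens):

def read_tokenization_rules(path):
    rules = []
    with open(path, encoding='utf8') as file_:
        for line in file_:
            line = line.strip()
            if not line or line.startswith('#'):
                continue  # skip the header comment and commented-out rules
            pieces = line.split()
            rules.append((pieces[0], pieces[1], pieces[2:]))
    return rules

# e.g. "couldn't've could not have" -> ("couldn't've", "could", ["not", "have"])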

File diff suppressed because it is too large.

View File

@@ -1,17 +1,15 @@
-from ext.sparsehash cimport dense_hash_map
-from spacy.lexeme cimport StringHash
+from libcpp.vector cimport vector
+from spacy.spacy cimport StringHash
+from spacy.spacy cimport Vocab
 from spacy.lexeme cimport Lexeme
+from spacy.lexeme cimport Lexeme_addr
-ctypedef Py_UNICODE* string_ptr
-ctypedef size_t Lexeme_addr  # For python interop
-ctypedef Lexeme* Lexeme_ptr
-cdef dense_hash_map[StringHash, Lexeme_ptr] LEXEMES
+cdef Vocab VOCAB
+cdef dict BACOV
 cpdef Lexeme_addr lookup(unicode word) except 0
-cpdef Lexeme_addr lookup_chunk(unicode chunk, int start, int end) except 0
-cdef StringHash hash_string(unicode s, size_t length) except 0
+cpdef vector[Lexeme_addr] tokenize(unicode string) except *
 cpdef unicode unhash(StringHash hash_value)

View File

@@ -9,211 +9,43 @@ from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector
 from spacy.lexeme cimport Lexeme
-from ext.murmurhash cimport MurmurHash64A
-from ext.murmurhash cimport MurmurHash64B
+from spacy.string_tools cimport substr
 from . import util
+cimport spacy
-STRINGS = {}
-LEXEMES = dense_hash_map[StringHash, Lexeme_ptr]()
-LEXEMES.set_empty_key(0)
-cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)
+BACOV = {}
+VOCAB = Vocab()
+VOCAB.set_empty_key(0)
+spacy.load_tokenization(VOCAB, BACOV, util.read_tokenization('en'))
-def load_tokenization(token_rules):
-    cdef Lexeme* word
-    cdef StringHash hashed
-    for chunk, lex, tokens in token_rules:
-        hashed = hash_string(chunk, len(chunk))
-        assert LEXEMES[hashed] == NULL
-        word = _add(hashed, lex, len(lex), len(lex))
-        for i, lex in enumerate(tokens):
-            token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
-            length = len(token_string)
-            hashed = hash_string(token_string, length)
-            word.tail = _add(hashed, lex, 0, len(lex))
-            word = word.tail
-load_tokenization(util.read_tokenization('en'))
 cpdef vector[Lexeme_addr] tokenize(unicode string) except *:
-    cdef size_t length = len(string)
-    cdef Py_UNICODE* characters = <Py_UNICODE*>string
-    cdef size_t i
-    cdef Py_UNICODE c
-    cdef vector[Lexeme_addr] tokens = vector[Lexeme_addr]()
-    cdef unicode current = u''
-    cdef Lexeme* token
-    for i in range(length):
-        c = characters[i]
-        if is_whitespace(c):
-            if current:
-                token = <Lexeme*>lookup(current)
-                while token != NULL:
-                    tokens.push_back(<Lexeme_addr>token)
-                    token = token.tail
-            current = u''
-        else:
-            current += c
-    if current:
-        token = <Lexeme*>lookup(current)
-        while token != NULL:
-            tokens.push_back(<Lexeme_addr>token)
-            token = token.tail
-    return tokens
-cdef inline bint is_whitespace(Py_UNICODE c):
-    # TODO: Support other unicode spaces
-    # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
-    if c == u' ':
-        return True
-    elif c == u'\n':
-        return True
-    elif c == u'\t':
-        return True
-    else:
-        return False
+    return spacy.tokenize(VOCAB, BACOV, find_split, string)
 cpdef Lexeme_addr lookup(unicode string) except 0:
-    '''.. function:: enumerate(sequence[, start=0])
-    Fetch a Lexeme representing a word string. If the word has not been seen,
-    construct one, splitting off any attached punctuation or clitics.  A
-    reference to BLANK_WORD is returned for the empty string.
-    To specify the boundaries of the word if it has not been seen, use lookup_chunk.
-    '''
-    if string == '':
-        return <Lexeme_addr>&BLANK_WORD
-    cdef size_t length = len(string)
-    cdef StringHash hashed = hash_string(string, length)
-    cdef Lexeme* word_ptr = LEXEMES[hashed]
-    cdef size_t n
-    if word_ptr == NULL:
-        word_ptr = _add(hashed, string, _find_split(string, length), length)
-    return <Lexeme_addr>word_ptr
-cpdef Lexeme_addr lookup_chunk(unicode string, int start, int end) except 0:
-    '''Fetch a Lexeme representing a word string. If the word has not been seen,
-    construct one, given the specified start and end indices.  A negative index
-    significes 0 for start, and the string length for end --- i.e. the string
-    will not be sliced if start == -1 and end == -1.
-    A reference to BLANK_WORD is returned for the empty string.
-    '''
-    if string == '':
-        return <Lexeme_addr>&BLANK_WORD
-    cdef size_t length = len(string)
-    cdef StringHash hashed = hash_string(string, length)
-    cdef Lexeme* chunk_ptr = LEXEMES[hashed]
-    if chunk_ptr == NULL:
-        chunk_ptr = _add(hashed, string, start, length)
-    return <Lexeme_addr>chunk_ptr
-cdef StringHash hash_string(unicode s, size_t length) except 0:
-    '''Hash unicode with MurmurHash64A'''
-    assert length
-    return MurmurHash64A(<string_ptr>s, length * sizeof(Py_UNICODE), 0)
+    return spacy.lookup(VOCAB, BACOV, find_split, -1, string)
 cpdef unicode unhash(StringHash hash_value):
-    '''Fetch a string from the reverse index, given its hash value.'''
-    cdef string_ptr string = STRINGS[hash_value]
-    if string == NULL:
-        raise ValueError(hash_value)
-    return string
+    return spacy.unhash(BACOV, hash_value)
-cdef unicode normalize_word_string(unicode word):
-    '''Return a normalized version of the word, mapping:
-    - 4 digit strings into !YEAR
-    - Other digit strings into !DIGITS
-    - All other strings into lower-case
-    '''
+cdef vector[StringHash] make_string_views(unicode word):
     cdef unicode s
-    if word.isdigit() and len(word) == 4:
-        return '!YEAR'
-    elif word[0].isdigit():
-        return '!DIGITS'
-    else:
-        return word.lower()
+    return vector[StringHash]()
+    #if word.isdigit() and len(word) == 4:
+    #    return '!YEAR'
+    #elif word[0].isdigit():
+    #    return '!DIGITS'
+    #else:
+    #    return word.lower()
-cpdef unicode _substr(unicode string, int start, int end, size_t length):
-    if end >= length:
-        end = -1
-    if start >= length:
-        start = 0
-    if start <= 0 and end < 0:
-        return string
-    elif start < 0:
-        start = 0
-    elif end < 0:
-        end = length
-    return string[start:end]
-cdef Lexeme* _add(StringHash hashed, unicode string, int split, size_t length) except NULL:
-    assert string
-    assert split <= length
-    word = _init_lexeme(string, hashed, split, length)
-    LEXEMES[hashed] = word
-    STRINGS[hashed] = string
-    return word
-cdef Lexeme* _init_lexeme(unicode string, StringHash hashed,
-                          int split, size_t length) except NULL:
-    assert split <= length
-    cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
-    word.first = <Py_UNICODE>(string[0] if string else 0)
-    word.sic = hashed
-    cdef unicode tail_string
-    cdef unicode lex
-    if split != 0 and split < length:
-        lex = _substr(string, 0, split, length)
-        tail_string = _substr(string, split, length, length)
-    else:
-        lex = string
-        tail_string = ''
-    assert lex
-    cdef unicode normed = normalize_word_string(lex)
-    cdef unicode last3 = _substr(string, length - 3, length, length)
-    assert normed
-    assert len(normed)
-    word.lex = hash_string(lex, len(lex))
-    word.normed = hash_string(normed, len(normed))
-    word.last3 = hash_string(last3, len(last3))
-    STRINGS[word.lex] = lex
-    STRINGS[word.normed] = normed
-    STRINGS[word.last3] = last3
-    # These are loaded later
-    word.prob = 0
-    word.cluster = 0
-    word.oft_upper = False
-    word.oft_title = False
-    # Now recurse, and deal with the tail
-    if tail_string:
-        word.tail = <Lexeme*>lookup(tail_string)
-    return word
-cdef size_t _find_split(unicode word, size_t length):
+cdef int find_split(unicode word, size_t length):
     cdef int i = 0
     # Contractions
     if word.endswith("'s"):
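
Only the "'s" branch of the new find_split() is visible in this hunk. As a sketch of what it computes (the non-"'s" branch below is a guess, not the committed code): the return value is the index where a chunk is cut into a word plus a clitic or punctuation tail, with 0 meaning no split.

def find_split_sketch(word, length):
    if length >= 3 and word.endswith("'s"):
        return length - 2      # "google's" -> "google" + "'s"
    if length >= 2 and word[-1] in u',.;:!?':
        return length - 1      # assumed branch: split off trailing punctuation
    return 0                   # no split: the whole chunk is one lexeme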

File diff suppressed because it is too large.

View File

@@ -1,9 +1,12 @@
 from libc.stdint cimport uint64_t
+# Put these above import to avoid circular import problem
 ctypedef int ClusterID
 ctypedef uint64_t StringHash
+ctypedef size_t Lexeme_addr
+from spacy.spacy cimport Vocab
+from spacy.spacy cimport Splitter
 cdef struct Lexeme:
     StringHash sic       # Hash of the original string
@@ -20,6 +23,12 @@ cdef struct Lexeme:
     Lexeme* tail         # Lexemes are linked lists, to deal with sub-tokens
+cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)
+cdef Lexeme* init_lexeme(Vocab vocab, dict bacov, Splitter find_split,
+                         unicode string, StringHash hashed,
+                         int split, size_t length) except NULL
 # Use these to access the Lexeme fields via get_attr(Lexeme*, LexAttr), which
 # has a conditional to pick out the correct item. This allows safe iteration
 # over the Lexeme, via:
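
A rough Python picture of the Lexeme struct declared above (field names taken from this diff, types simplified; the real thing is a C struct holding hashes): a chunk is a small linked list of lexemes chained through tail, so emitting a token also emits its sub-tokens.

from dataclasses import dataclass
from typing import Optional

@dataclass
class PyLexeme:
    sic: int = 0                         # hash of the original chunk string
    lex: int = 0                         # hash of the word form after splitting
    normed: int = 0                      # hash of the normalized form
    last3: int = 0                       # hash of the last three characters
    prob: float = 0.0                    # loaded later, from the data/en/* files above
    cluster: int = 0
    oft_upper: bool = False
    oft_title: bool = False
    first: str = ''
    tail: Optional["PyLexeme"] = None    # next sub-token, or None at the end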

View File

@@ -2,6 +2,60 @@
 Mostly useful from Python-space. From Cython-space, you can just cast to
 Lexeme* yourself.
 '''
+from __future__ import unicode_literals
+from spacy.string_tools cimport substr
+from spacy.spacy cimport hash_string
+from spacy.spacy cimport lookup
+from libc.stdlib cimport malloc, calloc, free
+from libc.stdint cimport uint64_t
+from libcpp.vector cimport vector
+cdef Lexeme* init_lexeme(Vocab vocab, dict bacov, Splitter find_split,
+                         unicode string, StringHash hashed,
+                         int split, size_t length) except NULL:
+    assert split <= length
+    cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
+    word.first = <Py_UNICODE>(string[0] if string else 0)
+    word.sic = hashed
+    cdef unicode tail_string
+    cdef unicode lex
+    if split != 0 and split < length:
+        lex = substr(string, 0, split, length)
+        tail_string = substr(string, split, length, length)
+    else:
+        lex = string
+        tail_string = ''
+    assert lex
+    #cdef unicode normed = normalize_word_string(lex)
+    cdef unicode normed = '?'
+    cdef unicode last3 = substr(string, length - 3, length, length)
+    assert normed
+    assert len(normed)
+    word.lex = hash_string(lex, len(lex))
+    word.normed = hash_string(normed, len(normed))
+    word.last3 = hash_string(last3, len(last3))
+    bacov[word.lex] = lex
+    bacov[word.normed] = normed
+    bacov[word.last3] = last3
+    # These are loaded later
+    word.prob = 0
+    word.cluster = 0
+    word.oft_upper = False
+    word.oft_title = False
+    # Now recurse, and deal with the tail
+    if tail_string:
+        word.tail = <Lexeme*>lookup(vocab, bacov, find_split, -1, tail_string)
+    return word
 cpdef StringHash sic_of(size_t lex_id) except 0:
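
A pure-Python mirror of the recursion in init_lexeme() above (a sketch: hash() stands in for hash_string(), the struct becomes a dict, and lookup() is reduced to recursing on the tail string), showing how an unseen chunk turns into a chain of sub-tokens:

def py_init_lexeme(bacov, find_split, string, split):
    length = len(string)
    lex, tail = (string[:split], string[split:]) if 0 < split < length else (string, '')
    word = {'sic': hash(string), 'lex': hash(lex), 'last3': hash(string[-3:]), 'tail': None}
    bacov[word['lex']] = lex
    if tail:
        word['tail'] = py_init_lexeme(bacov, find_split, tail,
                                      find_split(tail, len(tail)))
    return word

bacov = {}
chain = py_init_lexeme(bacov, lambda w, n: n - 2 if w.endswith("'s") else 0,
                       u"google's", 6)
assert bacov[chain['lex']] == u'google'
assert bacov[chain['tail']['lex']] == u"'s"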

File diff suppressed because it is too large.

View File

@@ -1,5 +1,24 @@
 from libcpp.vector cimport vector
+from libc.stdint cimport uint64_t
+from ext.sparsehash cimport dense_hash_map
+# Circular import problems here
+ctypedef size_t Lexeme_addr
+ctypedef uint64_t StringHash
+ctypedef dense_hash_map[StringHash, Lexeme_addr] Vocab
+ctypedef int (*Splitter)(unicode word, size_t length)
 from spacy.lexeme cimport Lexeme
+cdef load_tokenization(Vocab& vocab, dict bacov, token_rules)
+cdef vector[Lexeme_addr] tokenize(Vocab& vocab, dict bacov, Splitter splitter,
+                                  unicode string) except *
+cdef Lexeme_addr lookup(Vocab& vocab, dict bacov, Splitter splitter, int start,
+                        unicode string) except 0
+cdef StringHash hash_string(unicode s, size_t length) except 0
+cdef unicode unhash(dict bacov, StringHash hash_value)
 cpdef vector[size_t] expand_chunk(size_t addr) except *

View File

@@ -1,5 +1,78 @@
 from __future__ import unicode_literals
+from spacy.lexeme cimport Lexeme
+from ext.murmurhash cimport MurmurHash64A
+from ext.murmurhash cimport MurmurHash64B
+from spacy.lexeme cimport init_lexeme
+from spacy.lexeme cimport BLANK_WORD
+from spacy.string_tools cimport is_whitespace
+from . import util
+cdef load_tokenization(Vocab& vocab, dict bacov, token_rules):
+    cdef Lexeme* word
+    cdef StringHash hashed
+    for chunk, lex, tokens in token_rules:
+        hashed = hash_string(chunk, len(chunk))
+        assert vocab[hashed] == 0
+        word = _add(vocab, bacov, <Splitter>NULL, hashed, lex, len(lex), len(lex))
+        for i, lex in enumerate(tokens):
+            token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
+            length = len(token_string)
+            hashed = hash_string(token_string, length)
+            word.tail = _add(vocab, bacov, <Splitter>NULL, hashed, lex, 0, len(lex))
+            word = word.tail
+cdef vector[Lexeme_addr] tokenize(Vocab& vocab, dict bacov, Splitter splitter,
+                                  unicode string) except *:
+    cdef size_t length = len(string)
+    cdef Py_UNICODE* characters = <Py_UNICODE*>string
+    cdef size_t i
+    cdef Py_UNICODE c
+    cdef vector[Lexeme_addr] tokens = vector[Lexeme_addr]()
+    cdef unicode current = u''
+    cdef Lexeme* token
+    for i in range(length):
+        c = characters[i]
+        if is_whitespace(c):
+            if current:
+                token = <Lexeme*>lookup(vocab, bacov, splitter, -1, current)
+                while token != NULL:
+                    tokens.push_back(<Lexeme_addr>token)
+                    token = token.tail
+            current = u''
+        else:
+            current += c
+    if current:
+        token = <Lexeme*>lookup(vocab, bacov, splitter, -1, current)
+        while token != NULL:
+            tokens.push_back(<Lexeme_addr>token)
+            token = token.tail
+    return tokens
+cdef Lexeme_addr lookup(Vocab& vocab, dict bacov, Splitter find_split, int start,
+                        unicode string) except 0:
+    '''Fetch a Lexeme representing a word string. If the word has not been seen,
+    construct one, splitting off any attached punctuation or clitics. A
+    reference to BLANK_WORD is returned for the empty string.
+    To specify the boundaries of the word if it has not been seen, use lookup_chunk.
+    '''
+    if string == '':
+        return <Lexeme_addr>&BLANK_WORD
+    cdef size_t length = len(string)
+    cdef StringHash hashed = hash_string(string, length)
+    cdef Lexeme* word_ptr = <Lexeme*>vocab[hashed]
+    if word_ptr == NULL:
+        start = find_split(string, length) if start == -1 else start
+        word_ptr = _add(vocab, bacov, find_split, hashed, string, start, length)
+    return <Lexeme_addr>word_ptr
 cpdef vector[size_t] expand_chunk(size_t addr) except *:
@@ -11,3 +84,22 @@ cpdef vector[size_t] expand_chunk(size_t addr) except *:
     return tokens
+cdef StringHash hash_string(unicode s, size_t length) except 0:
+    '''Hash unicode with MurmurHash64A'''
+    assert length
+    return MurmurHash64A(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0)
+cdef unicode unhash(dict bacov, StringHash hash_value):
+    '''Fetch a string from the reverse index, given its hash value.'''
+    return bacov[hash_value]
+cdef Lexeme* _add(Vocab& vocab, dict bacov, Splitter find_split, StringHash hashed,
+                  unicode string, int split, size_t length) except NULL:
+    assert string
+    assert split <= length
+    word = init_lexeme(vocab, bacov, find_split, string, hashed, split, length)
+    vocab[hashed] = <Lexeme_addr>word
+    bacov[hashed] = string
+    return word
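
The hash/reverse-index pairing above, as a short Python sketch (hash() stands in for MurmurHash64A, so the values differ from the real ones). Every string view that gets hashed is also recorded in bacov, which is why unhash() is just a dict lookup:

def py_add(vocab, bacov, string):
    hashed = hash(string)
    vocab[hashed] = string      # the real code stores a Lexeme* address here
    bacov[hashed] = string      # reverse index: hash -> original string
    return hashed

vocab, bacov = {}, {}
key = py_add(vocab, bacov, u'spacy')
assert bacov[key] == u'spacy'   # what unhash(bacov, key) returns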

3
spacy/string_tools.pxd Normal file
View File

@@ -0,0 +1,3 @@
cpdef unicode substr(unicode string, int start, int end, size_t length)
cdef bint is_whitespace(Py_UNICODE c)

25
spacy/string_tools.pyx Normal file
View File

@@ -0,0 +1,25 @@
cpdef unicode substr(unicode string, int start, int end, size_t length):
    if end >= length:
        end = -1
    if start >= length:
        start = 0
    if start <= 0 and end < 0:
        return string
    elif start < 0:
        start = 0
    elif end < 0:
        end = length
    return string[start:end]


cdef bint is_whitespace(Py_UNICODE c):
    # TODO: Support other unicode spaces
    # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
    if c == u' ':
        return True
    elif c == u'\n':
        return True
    elif c == u'\t':
        return True
    else:
        return False
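
A few worked examples of the clamping in substr() above, assuming the module is compiled so the cpdef function is callable from Python:

assert substr(u"couldn't", 0, 5, 8) == u'could'   # plain slice
assert substr(u"couldn't", 5, 8, 8) == u"n't"     # end >= length is clamped back to length
assert substr(u'could', 0, 5, 5) == u'could'      # start <= 0 and end < 0: whole string returned
assert substr(u"I'm", 0, 3, 3) == u"I'm"          # the last3 view of a three-character word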