Mirror of https://github.com/explosion/spaCy.git

Commit 6da8010577: Merge branch 'feature/scale' into develop

@@ -1,11 +1,13 @@
+# cython profile=True
+
 from libc.stdint cimport uint64_t, int64_t
 
 
 cdef extern from "../include/MurmurHash3.h":
-    void MurmurHash3_x86_32(void * key, uint64_t len, uint64_t seed, void* out)
-    void MurmurHash3_x86_128(void * key, uint64_t len, uint64_t seed, void* out)
+    void MurmurHash3_x86_32(void * key, uint64_t len, uint64_t seed, void* out) nogil
+    void MurmurHash3_x86_128(void * key, uint64_t len, uint64_t seed, void* out) nogil
 
 
 cdef extern from "../include/MurmurHash2.h":
-    uint64_t MurmurHash64A(void * key, uint64_t len, int64_t seed)
-    uint64_t MurmurHash64B(void * key, uint64_t len, int64_t seed)
+    uint64_t MurmurHash64A(void * key, uint64_t len, int64_t seed) nogil
+    uint64_t MurmurHash64B(void * key, uint64_t len, int64_t seed) nogil
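
Note: the only functional change in the header above is the added nogil qualifier on the extern hash declarations, which allows them to be called with the GIL released. A minimal sketch of what that permits, assuming the ext.murmurhash declarations from this commit (the hash_word helper itself is illustrative, not part of the diff):

    from libc.stdint cimport uint64_t
    from ext.murmurhash cimport MurmurHash64A

    cdef uint64_t hash_word(Py_UNICODE* s, size_t length) nogil:
        # Legal in a nogil context only because the extern is declared nogil;
        # mirrors the hash_string() change further down in this commit.
        return MurmurHash64A(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0)
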
@@ -0,0 +1 @@
+# cython: profile=True
@@ -0,0 +1 @@
+# cython profile=True
@@ -1,2 +1 @@
 cython
-sparsehash
@@ -6,7 +6,7 @@ from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport Lexeme_addr
 
 
-cdef Vocab VOCAB
+cdef Vocab* VOCAB
 cdef dict BACOV
 
 
@@ -1,3 +1,4 @@
+# cython: profile=True
 '''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
 so that strings can be retrieved from hashes.  Use 64-bit hash values and
 boldly assume no collisions.
@@ -15,19 +16,18 @@ from . import util
 cimport spacy
 
 BACOV = {}
-VOCAB = Vocab()
+VOCAB = new Vocab(100000)
 VOCAB.set_empty_key(0)
 
 
 spacy.load_tokenization(VOCAB, BACOV, util.read_tokenization('en'))
 
-
 cpdef vector[Lexeme_addr] tokenize(unicode string) except *:
     return spacy.tokenize(VOCAB, BACOV, find_split, string)
 
 
 cpdef Lexeme_addr lookup(unicode string) except 0:
-    return spacy.lookup(VOCAB, BACOV, find_split, -1, string)
+    return spacy.lookup(VOCAB, BACOV, find_split, -1, string, len(string))
 
 
 cpdef unicode unhash(StringHash hash_value):
@@ -72,3 +72,6 @@ cdef bint is_punct(unicode word, size_t i, size_t length):
     if word[i] == "." and i < (length - 1) and word[i+1].isdigit():
         return False
     return not word[i].isalnum()
+
+
+#spacy.load_browns(VOCAB, BACOV, find_split)
@@ -6,7 +6,7 @@ from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport Lexeme_addr
 
 
-cdef Vocab VOCAB
+cdef Vocab* VOCAB
 cdef dict BACOV
 
 
@@ -15,7 +15,7 @@ from . import util
 cimport spacy
 
 BACOV = {}
-VOCAB = Vocab()
+VOCAB = new Vocab(100000)
 VOCAB.set_empty_key(0)
 
 
@@ -27,7 +27,7 @@ cpdef vector[Lexeme_addr] tokenize(unicode string) except *:
 
 
 cpdef Lexeme_addr lookup(unicode string) except 0:
-    return spacy.lookup(VOCAB, BACOV, find_split, -1, string)
+    return spacy.lookup(VOCAB, BACOV, find_split, -1, string, len(string))
 
 
 cpdef unicode unhash(StringHash hash_value):
@@ -25,9 +25,9 @@ cdef struct Lexeme:
 
 cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)
 
-cdef Lexeme* init_lexeme(Vocab vocab, dict bacov, Splitter find_split,
+cdef Lexeme* init_lexeme(Vocab* vocab, dict bacov, Splitter find_split,
                          unicode string, StringHash hashed,
-                         int split, size_t length) except NULL
+                         int split, size_t length)
 
 # Use these to access the Lexeme fields via get_attr(Lexeme*, LexAttr), which
 # has a conditional to pick out the correct item.  This allows safe iteration
@@ -1,3 +1,4 @@
+# cython: profile=True
 '''Accessors for Lexeme properties, given a lex_id, which is cast to a Lexeme*.
 Mostly useful from Python-space. From Cython-space, you can just cast to
 Lexeme* yourself.
@@ -13,9 +14,9 @@ from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector
 
 
-cdef Lexeme* init_lexeme(Vocab vocab, dict bacov, Splitter find_split,
+cdef Lexeme* init_lexeme(Vocab* vocab, dict bacov, Splitter find_split,
                          unicode string, StringHash hashed,
-                         int split, size_t length) except NULL:
+                         int split, size_t length):
     assert split <= length
     cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
 
@@ -54,7 +55,8 @@ cdef Lexeme* init_lexeme(Vocab vocab, dict bacov, Splitter find_split,
     
     # Now recurse, and deal with the tail
     if tail_string:
-        word.tail = <Lexeme*>lookup(vocab, bacov, find_split, -1, tail_string)
+        word.tail = <Lexeme*>lookup(vocab, bacov, find_split, -1, tail_string,
+                                    len(tail_string))
     return word
 
 
@@ -12,12 +12,13 @@ ctypedef int (*Splitter)(unicode word, size_t length)
 
 from spacy.lexeme cimport Lexeme
 
-cdef load_tokenization(Vocab& vocab, dict bacov, token_rules)
-cdef vector[Lexeme_addr] tokenize(Vocab& vocab, dict bacov, Splitter splitter,
+cdef load_tokenization(Vocab* vocab, dict bacov, token_rules)
+cdef load_browns(Vocab* vocab, dict bacov, Splitter find_split)
+cdef vector[Lexeme_addr] tokenize(Vocab* vocab, dict bacov, Splitter splitter,
                                   unicode string) except *
-cdef Lexeme_addr lookup(Vocab& vocab, dict bacov, Splitter splitter, int start,
-                        unicode string) except 0
-cdef StringHash hash_string(unicode s, size_t length) except 0
+cdef Lexeme_addr lookup(Vocab* vocab, dict bacov, Splitter splitter, int start,
+                        Py_UNICODE* string, size_t length) except 0
+cdef StringHash hash_string(Py_UNICODE* s, size_t length) nogil
 cdef unicode unhash(dict bacov, StringHash hash_value)
 
 
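
Note: this header turns every Vocab& reference parameter into a Vocab* pointer. Cython has no unary * dereference, so the implementation below indexes through the pointer as vocab[0][hashed]. A tiny illustrative sketch of the idiom (the increment helper is hypothetical, not from the commit):

    cdef void increment(int* cell):
        # cell[0] dereferences the pointer, like (*cell) in C; the same idiom
        # gives vocab[0][hashed] for a pointer to a map-like Vocab.
        cell[0] = cell[0] + 1
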
@@ -1,5 +1,8 @@
+# cython: profile=True
 from __future__ import unicode_literals
 
+from libc.stdlib cimport calloc, free
+
 from ext.murmurhash cimport MurmurHash64A
 from ext.murmurhash cimport MurmurHash64B
 
@@ -9,14 +12,16 @@ from spacy.lexeme cimport BLANK_WORD
 from spacy.string_tools cimport is_whitespace
 
 from . import util
+from os import path
+cimport cython
 
 
-cdef load_tokenization(Vocab& vocab, dict bacov, token_rules):
+cdef load_tokenization(Vocab* vocab, dict bacov, token_rules):
     cdef Lexeme* word
     cdef StringHash hashed
     for chunk, lex, tokens in token_rules:
         hashed = hash_string(chunk, len(chunk))
-        assert vocab[hashed] == 0, chunk
+        assert vocab[0][hashed] == 0, chunk
         word = _add(vocab, bacov, <Splitter>NULL, hashed, lex, len(lex), len(lex))
         for i, lex in enumerate(tokens):
             token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
@@ -26,7 +31,29 @@ cdef load_tokenization(Vocab& vocab, dict bacov, token_rules):
             word = word.tail
 
 
-cdef vector[Lexeme_addr] tokenize(Vocab& vocab, dict bacov, Splitter splitter,
+cdef load_browns(Vocab* vocab, dict bacov, Splitter find_split):
+    cdef Lexeme* w
+    data_dir = path.join(path.dirname(__file__), '..', 'data', 'en')
+    case_stats = util.load_case_stats(data_dir)
+    brown_loc = path.join(data_dir, 'clusters')
+    cdef size_t start
+    cdef int end
+    with util.utf8open(brown_loc) as browns_file:
+        for i, line in enumerate(browns_file):
+            cluster_str, token_string, freq_str = line.split()
+            # Decode as a little-endian string, so that we can do & 15 to get
+            # the first 4 bits. See redshift._parse_features.pyx
+            cluster = int(cluster_str[::-1], 2)
+            upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
+            start = 0
+            end = -1
+            hashed = hash_string(token_string, len(token_string))
+
+            word = _add(vocab, bacov, find_split, hashed, token_string,
+                        len(token_string), len(token_string))
+
+
+cdef vector[Lexeme_addr] tokenize(Vocab* vocab, dict bacov, Splitter splitter,
                                   unicode string) except *:
     cdef size_t length = len(string)
     cdef Py_UNICODE* characters = <Py_UNICODE*>string
@@ -35,40 +62,53 @@ cdef vector[Lexeme_addr] tokenize(Vocab& vocab, dict bacov, Splitter splitter,
     cdef Py_UNICODE c
 
     cdef vector[Lexeme_addr] tokens = vector[Lexeme_addr]()
-    cdef unicode current = u''
+    cdef Py_UNICODE* current = <Py_UNICODE*>calloc(len(string), sizeof(Py_UNICODE))
+    cdef size_t word_len = 0
     cdef Lexeme* token
    for i in range(length):
         c = characters[i]
-        if is_whitespace(c):
-            if current:
-                token = <Lexeme*>lookup(vocab, bacov, splitter, -1, current)
+        if _is_whitespace(c):
+            if word_len != 0:
+                token = <Lexeme*>lookup(vocab, bacov, splitter, -1, current, word_len)
                 while token != NULL:
                     tokens.push_back(<Lexeme_addr>token)
                     token = token.tail
-            current = u''
+                for j in range(word_len+1):
+                    current[j] = 0
+                word_len = 0
         else:
-            current += c
-    if current:
-        token = <Lexeme*>lookup(vocab, bacov, splitter, -1, current)
+            current[word_len] = c
+            word_len += 1
+    if word_len != 0:
+        token = <Lexeme*>lookup(vocab, bacov, splitter, -1, current, word_len)
         while token != NULL:
             tokens.push_back(<Lexeme_addr>token)
             token = token.tail
+    free(current)
     return tokens
 
+cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
+    if c == ' ':
+        return True
+    elif c == '\n':
+        return True
+    elif c == '\t':
+        return True
+    else:
+        return False
 
-cdef Lexeme_addr lookup(Vocab& vocab, dict bacov, Splitter find_split, int start,
-                        unicode string) except 0:
+cdef Lexeme_addr lookup(Vocab* vocab, dict bacov, Splitter find_split, int start,
+                        Py_UNICODE* string, size_t length) except 0:
     '''Fetch a Lexeme representing a word string. If the word has not been seen,
     construct one, splitting off any attached punctuation or clitics.  A
     reference to BLANK_WORD is returned for the empty string.
     
     To specify the boundaries of the word if it has not been seen, use lookup_chunk.
     '''
-    if string == '':
+    if length == 0:
         return <Lexeme_addr>&BLANK_WORD
-    cdef size_t length = len(string)
     cdef StringHash hashed = hash_string(string, length)
-    cdef Lexeme* word_ptr = <Lexeme*>vocab[hashed]
+    cdef Lexeme* word_ptr = <Lexeme*>vocab[0][hashed]
     if word_ptr == NULL:
         start = find_split(string, length) if start == -1 else start
         word_ptr = _add(vocab, bacov, find_split, hashed, string, start, length)
@@ -84,9 +124,8 @@ cpdef vector[size_t] expand_chunk(size_t addr) except *:
     return tokens
 
 
-cdef StringHash hash_string(unicode s, size_t length) except 0:
+cdef StringHash hash_string(Py_UNICODE* s, size_t length) nogil:
     '''Hash unicode with MurmurHash64A'''
-    assert length
     return MurmurHash64A(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0)
 
 
@@ -95,11 +134,10 @@ cdef unicode unhash(dict bacov, StringHash hash_value):
     return bacov[hash_value]
 
 
-cdef Lexeme* _add(Vocab& vocab, dict bacov, Splitter find_split, StringHash hashed,
-                  unicode string, int split, size_t length) except NULL:
-    assert string
-    assert split <= length
+@cython.nonecheck(False)
+cdef Lexeme* _add(Vocab* vocab, dict bacov, Splitter find_split, StringHash hashed,
+                  unicode string, int split, size_t length):
     word = init_lexeme(vocab, bacov, find_split, string, hashed, split, length)
-    vocab[hashed] = <Lexeme_addr>word
+    vocab[0][hashed] = <Lexeme_addr>word
     bacov[hashed] = string
     return word
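
Note: the new load_browns reverses each Brown cluster bit-string before parsing it, so that, as the in-line comment says, the first four bits of the cluster path can be read off with & 15. A small worked illustration with a made-up cluster string:

    cluster_str = '110110'                # hypothetical Brown cluster path
    cluster = int(cluster_str[::-1], 2)   # parse as little-endian -> 27
    prefix = cluster & 15                 # low 4 bits == first 4 characters '1101' -> 11
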
@@ -1,3 +1,5 @@
+# cython: profile=True
+
 cpdef unicode substr(unicode string, int start, int end, size_t length):
     if end >= length:
         end = -1
@@ -10,7 +10,7 @@ def utf8open(loc, mode='r'):
 
 
 def load_case_stats(data_dir):
-    case_loc = path.join(data_dir, 'english.case')
+    case_loc = path.join(data_dir, 'case')
     case_stats = {}
     with utf8open(case_loc) as cases_file:
        for line in cases_file:
@@ -42,46 +42,3 @@ def read_tokenization(lang):
                 seen.add(chunk)
                 entries.append((chunk, lex, pieces))
     return entries
- 
-
-"""
-    def load_browns(self, data_dir):
-        cdef Lexeme* w
-        case_stats = load_case_stats(data_dir)
-        brown_loc = path.join(data_dir, 'bllip-clusters')
-        assert path.exists(brown_loc)
-        cdef size_t start
-        cdef int end
-        with utf8open(brown_loc) as browns_file:
-            for i, line in enumerate(browns_file):
-                cluster_str, word, freq_str = line.split()
-                # Decode as a little-endian string, so that we can do & 15 to get
-                # the first 4 bits. See redshift._parse_features.pyx
-                cluster = int(cluster_str[::-1], 2)
-                upper_pc, title_pc = case_stats.get(word.lower(), (0.0, 0.0))
-                start = 0
-                end = -1
-                find_slice(&start, &end, word)
-                print "Load", repr(word), start, end
-                w = <Lexeme*>init_word(word, start, end, cluster,
-                                      upper_pc, title_pc, int(freq_str))
-                self.words[_hash_str(word)] = <size_t>w
-                self.strings[<size_t>w] = word
-
-    def load_clitics(self, data_dir):
-        cdef unicode orig_str
-        cdef unicode clitic
-        for orig_str, norm_form, clitic_strs in util.load_clitics(data_dir):
-            w = init_clitic(orig_str, <Lexeme*>self.lookup_slice(norm_form, 0, -1))
-            self.words[w.orig] = <size_t>w
-            self.strings[<size_t>w] = orig_str
-            assert len(clitic_strs) < MAX_CLITICS
-            assert clitic_strs
-            for i, clitic in enumerate(clitic_strs):
-                # If we write punctuation here, assume we want to keep it,
-                # so tell it the slice boundaries (the full string)
-                w.clitics[i] = self.lookup_slice(clitic, 0, -1)
-            # Ensure we null terminate
-            w.clitics[i+1] = 0
-"""
-