Mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 01:48:04 +03:00)
	* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word.
This commit is contained in:
parent b94c9b72c9
commit 01469b0888
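For orientation, here is a minimal pure-Python sketch of the data model this commit moves to: each whitespace-delimited chunk is cached as a flat, one-lexeme-per-word sequence, instead of a single lexeme with sub-tokens hanging off a tail linked list. The ToyLanguage class, the dict-backed tables and the dict lexeme records are illustrative stand-ins for the Cython types in the diff below, not part of the commit.

    class ToyLanguage(object):
        def __init__(self):
            self.chunks = {}   # chunk string -> tuple of lexemes, one per word
            self.vocab = {}    # word string -> lexeme record

        def find_split(self, word):
            # Base behaviour in the diff: never split off a suffix.
            return len(word)

        def find_substrings(self, word):
            # Same loop as Language.find_substrings in spacy/spacy.pyx.
            substrings = []
            while word:
                split = self.find_split(word)
                if split == 0:
                    substrings.append(word)
                    break
                substrings.append(word[:split])
                word = word[split:]
            return substrings

        def lookup(self, substring):
            return self.vocab.setdefault(substring, {'lex': substring})

        def lookup_chunk(self, chunk_str):
            # One cached array of lexemes per chunk, built on first sight.
            if chunk_str not in self.chunks:
                self.chunks[chunk_str] = tuple(self.lookup(s)
                                               for s in self.find_substrings(chunk_str))
            return self.chunks[chunk_str]

        def tokenize(self, text):
            tokens = []
            for chunk_str in text.split():
                tokens.extend(self.lookup_chunk(chunk_str))
            return tokens

The Cython version stores each chunk as a NULL-terminated Lexeme** array in a dense_hash_map keyed on the chunk's hash, so tokenize only has to walk arrays rather than chase tail pointers.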
					
				
							
								
								
									
setup.py | 21
@@ -39,29 +39,20 @@ cython_includes = ['.']
 if 'VIRTUAL_ENV' in os.environ:
     includes += glob(path.join(os.environ['VIRTUAL_ENV'], 'include', 'site', '*'))
-    cython_includes += glob(path.join(os.environ['VIRTUAL_ENV'], 'lib', '*'))
 else:
     # If you're not using virtualenv, set your include dir here.
     pass


 exts = [
+    Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.en", ["spacy/en.pyx"], language="c++",
-              include_dirs=includes, cython_include_dirs=cython_includes),
-    Extension("spacy.en_ptb", ["spacy/en_ptb.pyx"], language="c++", include_dirs=includes,
-              cython_include_dirs=cython_includes),
-    Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes,
-              cython_include_dirs=cython_includes),
-    Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes,
-              cython_include_dirs=cython_includes),
-    Extension("spacy._hashing", ["spacy/_hashing.pyx"], language="c++", include_dirs=includes,
-              cython_include_dirs=cython_includes),
-    Extension("spacy.chartree", ["spacy/chartree.pyx"], language="c++", include_dirs=includes,
-              cython_include_dirs=cython_includes),
-    Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes,
-              cython_include_dirs=cython_includes),
+              include_dirs=includes),
+    Extension("spacy.en_ptb", ["spacy/en_ptb.pyx"], language="c++", include_dirs=includes),
+    Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
+    Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.string_tools", ["spacy/string_tools.pyx"], language="c++",
-              include_dirs=includes, cython_include_dirs=cython_includes),
+              include_dirs=includes),
 ]
@@ -1,16 +1,14 @@
 from .lexeme import lex_of
-from .lexeme import sic_of
 from .lexeme import length_of

 from .tokens import Tokens

 # Don't know how to get the enum Python visible :(

-SIC = 0
-LEX = 1
-NORM = 2
-SHAPE = 3
-LAST3 = 4
-LENGTH = 5
+LEX = 0
+NORM = 1
+SHAPE = 2
+LAST3 = 3
+LENGTH = 4

-__all__ = [Tokens, lex_of, sic_of, length_of, SIC, LEX, NORM, SHAPE, LAST3, LENGTH]
+__all__ = [Tokens, lex_of, length_of, LEX, NORM, SHAPE, LAST3, LENGTH]
@@ -9,7 +9,7 @@ from spacy.tokens cimport Tokens


 cdef class English(spacy.Language):
-    cdef int find_split(self, unicode word, size_t length)
+    cdef int find_split(self, unicode word)

 cdef English EN
							
								
								
									
spacy/en.pyx | 10
@@ -17,10 +17,13 @@ cimport spacy


 cdef class English(spacy.Language):
-    cdef int find_split(self, unicode word, size_t length):
+    cdef int find_split(self, unicode word):
+        cdef size_t length = len(word)
         cdef int i = 0
+        if word.startswith("'s") or word.startswith("'S"):
+            return 2
         # Contractions
-        if word.endswith("'s"):
+        if word.endswith("'s") and length >= 3:
             return length - 2
         # Leading punctuation
         if is_punct(word, 0, length):
@@ -36,7 +39,6 @@ cdef class English(spacy.Language):
 cdef bint is_punct(unicode word, size_t i, size_t length):
     # Don't count appostrophes as punct if the next char is a letter
     if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
-        # ...Unless we're at 0
         return i == 0
     if word[i] == "-" and i < (length - 1) and word[i+1] == '-':
         return False
@@ -57,7 +59,7 @@ cpdef Tokens tokenize(unicode string):


 cpdef Lexeme_addr lookup(unicode string) except 0:
-    return EN.lookup_chunk(string)
+    return <Lexeme_addr>EN.lookup(string)


 cpdef unicode unhash(StringHash hash_value):
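As a worked example of how the English-specific splitting now interacts with the generic loop, the snippet below reimplements only the two contraction rules from this hunk together with Language.find_substrings from spacy/spacy.pyx further down; the punctuation handling is omitted, so this is a sketch rather than the module's actual behaviour.

    def find_split(word):
        if word.startswith("'s") or word.startswith("'S"):
            return 2
        if word.endswith("'s") and len(word) >= 3:
            return len(word) - 2
        return len(word)

    def find_substrings(word):
        substrings = []
        while word:
            split = find_split(word)
            if split == 0:
                substrings.append(word)
                break
            substrings.append(word[:split])
            word = word[split:]
        return substrings

    assert find_substrings("Mike's") == ["Mike", "'s"]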
@@ -8,7 +8,7 @@ from spacy.tokens cimport Tokens


 cdef class EnglishPTB(Language):
-    cdef int find_split(self, unicode word, size_t length)
+    cdef int find_split(self, unicode word)


 cdef EnglishPTB EN_PTB
@@ -17,7 +17,8 @@ cimport spacy


 cdef class EnglishPTB(Language):
-    cdef int find_split(self, unicode word, size_t length):
+    cdef int find_split(self, unicode word):
+        length = len(word)
         cdef int i = 0
         # Contractions
         if word.endswith("'s"):
@@ -53,7 +54,7 @@ cpdef Tokens tokenize(unicode string):


 cpdef Lexeme_addr lookup(unicode string) except 0:
-    return EN_PTB.lookup_chunk(string)
+    return <Lexeme_addr>EN_PTB.lookup_chunk(string)


 cpdef unicode unhash(StringHash hash_value):
@@ -32,14 +32,13 @@ cdef struct Lexeme:

     Distribution* dist # Distribution info, lazy loaded
     Orthography* orth  # Extra orthographic views
-    Lexeme* tail # Lexemes are linked lists, to deal with sub-tokens
+    #Lexeme* tail # Lexemes are linked lists, to deal with sub-tokens


 cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)


 cdef enum StringAttr:
-    SIC
     LEX
     NORM
     SHAPE
@@ -49,7 +48,6 @@ cdef enum StringAttr:

 cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0

-cpdef StringHash sic_of(size_t lex_id) except 0
 cpdef StringHash lex_of(size_t lex_id) except 0
 cpdef StringHash norm_of(size_t lex_id) except 0
 cpdef StringHash shape_of(size_t lex_id) except 0
@@ -22,9 +22,7 @@ from spacy.spacy cimport StringHash


 cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
-    if attr == SIC:
-        return sic_of(lex_id)
-    elif attr == LEX:
+    if attr == LEX:
         return lex_of(lex_id)
     elif attr == NORM:
         return norm_of(lex_id)
@@ -38,18 +36,6 @@ cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
         raise StandardError


-cpdef StringHash sic_of(size_t lex_id) except 0:
-    '''Access the `sic' field of the Lexeme pointed to by lex_id.
-    
-    The sic field stores the hash of the whitespace-delimited string-chunk used to
-    construct the Lexeme.
-    
-    >>> [unhash(sic_of(lex_id)) for lex_id in from_string(u'Hi! world')]
-    [u'Hi!', u'', u'world]
-    '''
-    return (<Lexeme*>lex_id).sic
-
-
 cpdef StringHash lex_of(size_t lex_id) except 0:
     '''Access the `lex' field of the Lexeme pointed to by lex_id.
@@ -3,8 +3,6 @@ from libc.stdint cimport uint32_t
 from libc.stdint cimport uint64_t

 from sparsehash.dense_hash_map cimport dense_hash_map
-from _hashing cimport FixedTable
-from _hashing cimport WordTree

 # Circular import problems here
 ctypedef size_t Lexeme_addr
@@ -28,22 +26,21 @@ from spacy._hashing cimport WordTree

 cdef class Language:
     cdef object name
-    cdef WordTree vocab
-    cdef WordTree distri
-    cdef WordTree ortho
+    cdef dense_hash_map[StringHash, size_t] chunks
+    cdef dense_hash_map[StringHash, size_t] vocab
     cdef dict bacov

-    cpdef Tokens tokenize(self, unicode text)
+    cdef Tokens tokenize(self, unicode text)

-    cdef Lexeme_addr lookup(self, unicode string) except 0
-    cdef Lexeme_addr lookup_chunk(self, unicode string) except 0
-    cdef Orthography* lookup_orth(self, unicode lex) except NULL
-    cdef Distribution* lookup_dist(self, unicode lex) except NULL
+    cdef Lexeme* lookup(self, unicode string) except NULL
+    cdef Lexeme** lookup_chunk(self, unicode string) except NULL

-    cdef Lexeme* new_lexeme(self, unicode key, unicode lex) except NULL
+    cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL
+    cdef Lexeme* new_lexeme(self, unicode lex) except NULL
     cdef Orthography* new_orth(self, unicode lex) except NULL
     cdef Distribution* new_dist(self, unicode lex) except NULL

     cdef unicode unhash(self, StringHash hashed)

-    cdef int find_split(self, unicode word, size_t length)
+    cpdef list find_substrings(self, unicode word)
+    cdef int find_split(self, unicode word)
							
								
								
									
spacy/spacy.pyx | 150
@@ -5,7 +5,6 @@ from libc.stdlib cimport calloc, free
 from libcpp.pair cimport pair
 from cython.operator cimport dereference as deref

-from murmurhash cimport mrmr
 from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport BLANK_WORD

@@ -64,86 +63,56 @@ cdef class Language:
     def __cinit__(self, name):
         self.name = name
         self.bacov = {}
-        self.vocab = WordTree(0, 5)
-        self.ortho = WordTree(0, 5)
-        self.distri = WordTree(0, 5)
+        self.chunks = dense_hash_map[StringHash, size_t]()
+        self.vocab = dense_hash_map[StringHash, size_t]()
+        self.chunks.set_empty_key(0)
+        self.vocab.set_empty_key(0)
         self.load_tokenization(util.read_tokenization(name))

-    cpdef Tokens tokenize(self, unicode characters):
+    cdef Tokens tokenize(self, unicode characters):
         cdef size_t i = 0
         cdef size_t start = 0
+        cdef Lexeme** chunk
         cdef Tokens tokens = Tokens(self)
-        cdef Lexeme* token
-        for c in characters:
-            if _is_whitespace(c):
-                if start < i:
-                    token = <Lexeme*>self.lookup_chunk(characters[start:i])
-                    while token != NULL:
-                        tokens.append(<Lexeme_addr>token)
-                        token = token.tail
-                start = i + 1
+        for chunk_str in characters.split():
+            chunk = self.lookup_chunk(chunk_str)
+            i = 0
+            while chunk[i] != NULL:
+                tokens.append(<Lexeme_addr>chunk[i])
                 i += 1
-        if start < i:
-            token = <Lexeme*>self.lookup_chunk(characters[start:])
-            while token != NULL:
-                tokens.append(<Lexeme_addr>token)
-                token = token.tail
         return tokens

-    cdef Lexeme_addr lookup(self, unicode string) except 0:
-        cdef size_t length = len(string)
-        if length == 0:
-            return <Lexeme_addr>&BLANK_WORD
-        cdef StringHash hashed = hash(string)
-        # First, check words seen 2+ times
-        cdef Lexeme* word_ptr = <Lexeme*>self.vocab.get(string)
-        if word_ptr == NULL:
-            word_ptr = self.new_lexeme(string, string)
-        return <Lexeme_addr>word_ptr
-
-    cdef Lexeme_addr lookup_chunk(self, unicode string) except 0:
-        '''Fetch a Lexeme representing a word string. If the word has not been seen,
-        construct one, splitting off any attached punctuation or clitics.  A
-        reference to BLANK_WORD is returned for the empty string.
-        '''
-        cdef size_t length = len(string)
-        if length == 0:
-            return <Lexeme_addr>&BLANK_WORD
-        # First, check words seen 2+ times
-        cdef Lexeme* word_ptr = <Lexeme*>self.vocab.get(string)
+    cdef Lexeme* lookup(self, unicode string) except NULL:
+        if len(string) == 0:
+            return &BLANK_WORD
+        cdef Lexeme* word = <Lexeme*>self.vocab[hash(string)]
+        if word == NULL:
+            word = self.new_lexeme(string)
+        return word
+
+    cdef Lexeme** lookup_chunk(self, unicode string) except NULL:
+        assert len(string) != 0
+        cdef Lexeme** chunk = <Lexeme**>self.chunks[hash(string)]
         cdef int split
-        if word_ptr == NULL:
-            split = self.find_split(string, length)
-            if split != 0 and split != -1 and split < length:
-                word_ptr = self.new_lexeme(string, string[:split])
-                word_ptr.tail = <Lexeme*>self.lookup_chunk(string[split:])
-            else:
-                word_ptr = self.new_lexeme(string, string)
-        return <Lexeme_addr>word_ptr
+        if chunk == NULL:
+            chunk = self.new_chunk(string, self.find_substrings(string))
+        return chunk

-    cdef Orthography* lookup_orth(self, unicode lex):
-        cdef Orthography* orth = <Orthography*>self.ortho.get(lex)
-        if orth == NULL:
-            orth = self.new_orth(lex)
-        return orth
-
-    cdef Distribution* lookup_dist(self, unicode lex):
-        cdef Distribution* dist = <Distribution*>self.distri.get(lex)
-        if dist == NULL:
-            dist = self.new_dist(lex)
-        return dist
-
-    cdef Lexeme* new_lexeme(self, unicode key, unicode string) except NULL:
+    cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL:
+        cdef Lexeme** chunk = <Lexeme**>calloc(len(substrings) + 1, sizeof(Lexeme*))
+        for i, substring in enumerate(substrings):
+            chunk[i] = self.lookup(substring)
+        chunk[i + 1] = NULL
+        self.chunks[hash(string)] = <size_t>chunk
+        return chunk
+
+    cdef Lexeme* new_lexeme(self, unicode string) except NULL:
         cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
-        word.sic = hash(key)
         word.lex = hash(string)
         self.bacov[word.lex] = string
-        self.bacov[word.sic] = key
-        word.orth = self.lookup_orth(string)
-        word.dist = self.lookup_dist(string)
-        self.vocab.set(key, <size_t>word)
+        word.orth = self.new_orth(string)
+        word.dist = self.new_dist(string)
+        self.vocab[word.lex] = <size_t>word
         return word

     cdef Orthography* new_orth(self, unicode lex) except NULL:
@@ -170,30 +139,33 @@ cdef class Language:
         self.bacov[orth.norm] = norm
         self.bacov[orth.shape] = shape

-        self.ortho.set(lex, <size_t>orth)
         return orth

     cdef Distribution* new_dist(self, unicode lex) except NULL:
         dist = <Distribution*>calloc(1, sizeof(Distribution))
-        self.distri.set(lex, <size_t>dist)
         return dist

     cdef unicode unhash(self, StringHash hash_value):
         '''Fetch a string from the reverse index, given its hash value.'''
         return self.bacov[hash_value]

-    cdef int find_split(self, unicode word, size_t length):
-        return -1
+    cpdef list find_substrings(self, unicode word):
+        substrings = []
+        while word:
+            split = self.find_split(word)
+            if split == 0:
+                substrings.append(word)
+                break
+            substrings.append(word[:split])
+            word = word[split:]
+        return substrings
+
+    cdef int find_split(self, unicode word):
+        return len(word)

     def load_tokenization(self, token_rules=None):
-        cdef Lexeme* word
-        cdef StringHash hashed
-        for chunk, lex, tokens in token_rules:
-            word = <Lexeme*>self.new_lexeme(chunk, lex)
-            for i, lex in enumerate(tokens):
-                token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
-                word.tail = <Lexeme*>self.new_lexeme(token_string, lex)
-                word = word.tail
+        for chunk, tokens in token_rules:
+            self.new_chunk(chunk, tokens)

     def load_clusters(self):
         cdef Lexeme* w
@@ -209,24 +181,4 @@ cdef class Language:
                 # the first 4 bits. See redshift._parse_features.pyx
                 cluster = int(cluster_str[::-1], 2)
                 upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
-                word = self.new_lexeme(token_string, token_string)
-
-
-cdef inline bint _is_whitespace(unsigned char c) nogil:
-    if c == b' ':
-        return True
-    elif c == b'\n':
-        return True
-    elif c == b'\t':
-        return True
-    else:
-        return False
-
-
-cpdef vector[size_t] expand_chunk(size_t addr) except *:
-    cdef vector[size_t] tokens = vector[size_t]()
-    word = <Lexeme*>addr
-    while word != NULL:
-        tokens.push_back(<size_t>word)
-        word = word.tail
-    return tokens
+                self.new_lexeme(token_string)
@@ -1,5 +1,6 @@
 from libcpp.vector cimport vector
 from spacy.spacy cimport Lexeme_addr
+from spacy.lexeme cimport Lexeme

 from cython.operator cimport dereference as deref
 from spacy.spacy cimport Language
@@ -32,13 +32,12 @@ def read_tokenization(lang):
                 continue
             pieces = line.split()
             chunk = pieces.pop(0)
-            lex = pieces.pop(0)
             assert chunk not in seen, chunk
             seen.add(chunk)
-            entries.append((chunk, lex, pieces))
+            entries.append((chunk, list(pieces)))
             if chunk[0].isalpha() and chunk[0].islower():
                 chunk = chunk[0].title() + chunk[1:]
-                lex = lex[0].title() + lex[1:]
+                pieces[0] = pieces[0][0].title() + pieces[0][1:]
                 seen.add(chunk)
-                entries.append((chunk, lex, pieces))
+                entries.append((chunk, pieces))
     return entries
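With the lex column gone, each rule that read_tokenization returns is now a (chunk, pieces) pair that load_tokenization can hand straight to new_chunk. A hypothetical illustration of the shape (the real rule strings live in the language data files, not in this diff):

    rules = [
        ("ain't", ["are", "not"]),
        ("Ain't", ["Are", "not"]),   # title-cased variant generated by read_tokenization
    ]
    for chunk, pieces in rules:
        assert isinstance(pieces, list)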
@@ -1,44 +1,43 @@
 from __future__ import unicode_literals

-from spacy.spacy import expand_chunk
-from spacy.en import lookup, unhash
+from spacy.en import tokenize, lookup, unhash

 from spacy import lex_of


 def test_possess():
-    tokens = expand_chunk(lookup("Mike's"))
-    assert len(tokens) == 2
+    tokens = tokenize("Mike's")
     assert unhash(lex_of(tokens[0])) == "Mike"
     assert unhash(lex_of(tokens[1])) == "'s"
+    assert len(tokens) == 2


 def test_apostrophe():
-    tokens = expand_chunk(lookup("schools'"))
+    tokens = tokenize("schools'")
     assert len(tokens) == 2
     assert unhash(lex_of(tokens[1])) == "'"
     assert unhash(lex_of(tokens[0])) == "schools"


 def test_LL():
-    tokens = expand_chunk(lookup("we'll"))
+    tokens = tokenize("we'll")
     assert len(tokens) == 2
     assert unhash(lex_of(tokens[1])) == "will"
     assert unhash(lex_of(tokens[0])) == "we"


 def test_aint():
-    tokens = expand_chunk(lookup("ain't"))
+    tokens = tokenize("ain't")
     assert len(tokens) == 2
     assert unhash(lex_of(tokens[0])) == "are"
     assert unhash(lex_of(tokens[1])) == "not"


 def test_capitalized():
-    tokens = expand_chunk(lookup("can't"))
+    tokens = tokenize("can't")
     assert len(tokens) == 2
-    tokens = expand_chunk(lookup("Can't"))
+    tokens = tokenize("Can't")
     assert len(tokens) == 2
-    tokens = expand_chunk(lookup("Ain't"))
+    tokens = tokenize("Ain't")
     assert len(tokens) == 2
     assert unhash(lex_of(tokens[0])) == "Are"
@@ -5,7 +5,7 @@ import pytest
 from spacy import en
 from spacy.lexeme import lex_of

-from spacy import SIC, LEX, NORM, SHAPE, LAST3
+from spacy import LEX, NORM, SHAPE, LAST3


 def test_group_by_lex():
@@ -4,7 +4,7 @@ import pytest

 from spacy.en import lookup, unhash

-from spacy.lexeme import sic_of, lex_of, norm_of, shape_of, first_of, length_of
+from spacy.lexeme import lex_of, norm_of, shape_of, first_of, length_of
 from spacy.lexeme import shape_of

 @pytest.fixture
@@ -1,8 +1,8 @@
 from __future__ import unicode_literals

 from spacy import lex_of
-from spacy.spacy import expand_chunk
 from spacy.en import lookup
+from spacy.en import tokenize
 from spacy.en import unhash

 import pytest
@@ -17,8 +17,7 @@ def test_close(close_puncts):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p
-        token = lookup(string)
-        tokens = expand_chunk(token)
+        tokens = tokenize(string)
         assert len(tokens) == 2
         assert unhash(lex_of(tokens[1])) == p
         assert unhash(lex_of(tokens[0])) == word_str
@@ -28,9 +27,7 @@ def test_two_different_close(close_puncts):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + "'"
-        token = lookup(string)
-        assert unhash(lex_of(token)) == word_str
-        tokens = expand_chunk(token)
+        tokens = tokenize(string)
         assert len(tokens) == 3
         assert unhash(lex_of(tokens[0])) == word_str
         assert unhash(lex_of(tokens[1])) == p
@@ -41,7 +38,7 @@ def test_three_same_close(close_puncts):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + p + p
-        tokens = expand_chunk(lookup(string))
+        tokens = tokenize(string)
         assert len(tokens) == 4
         assert unhash(lex_of(tokens[0])) == word_str
         assert unhash(lex_of(tokens[1])) == p
@@ -1,8 +1,8 @@
 from __future__ import unicode_literals

 from spacy import lex_of
-from spacy.spacy import expand_chunk
 from spacy.en import lookup
+from spacy.en import tokenize
 from spacy.en import unhash

 import pytest
@@ -17,9 +17,7 @@ def test_open(open_puncts):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + word_str
-        token = lookup(string)
-        assert unhash(lex_of(token)) == p
-        tokens = expand_chunk(token)
+        tokens = tokenize(string)
         assert len(tokens) == 2
         assert unhash(lex_of(tokens[0])) == p
         assert unhash(lex_of(tokens[1])) == word_str
@@ -29,9 +27,7 @@ def test_two_different_open(open_puncts):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + "`" + word_str
-        token = lookup(string)
-        assert unhash(lex_of(token)) == p
-        tokens = expand_chunk(token)
+        tokens = tokenize(string)
         assert len(tokens) == 3
         assert unhash(lex_of(tokens[0])) == p
         assert unhash(lex_of(tokens[1])) == "`"
@@ -42,9 +38,7 @@ def test_three_same_open(open_puncts):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + p + p + word_str
-        token = lookup(string)
-        assert unhash(lex_of(token)) == p
-        tokens = expand_chunk(token)
+        tokens = tokenize(string)
         assert len(tokens) == 4
         assert unhash(lex_of(tokens[0])) == p
         assert unhash(lex_of(tokens[3])) == word_str
@@ -52,6 +46,6 @@ def test_three_same_open(open_puncts):

 def test_open_appostrophe():
     string = "'The"
-    tokens = expand_chunk(lookup(string))
+    tokens = tokenize(string)
     assert len(tokens) == 2
     assert unhash(lex_of(tokens[0])) == "'"
@@ -5,7 +5,7 @@ def test_load_en():
     rules = util.read_tokenization('en')
     assert len(rules) != 0
     aint = [rule for rule in rules if rule[0] == "ain't"][0]
-    chunk, lex, pieces = aint
+    chunk, pieces = aint
     assert chunk == "ain't"
-    assert lex == "are"
-    assert pieces == ["not"]
+    assert pieces[0] == "are"
+    assert pieces[1] == "not"
@@ -1,7 +1,7 @@
 from __future__ import unicode_literals

-from spacy import lex_of, sic_of
-from spacy.spacy import expand_chunk
+from spacy import lex_of
+from spacy.en import tokenize
 from spacy.en import lookup
 from spacy.en import unhash

@@ -17,19 +17,18 @@ def test_token(paired_puncts):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = open_ + word_str + close_
-        tokens = expand_chunk(lookup(string))
+        tokens = tokenize(string)
         assert len(tokens) == 3
         assert unhash(lex_of(tokens[0])) == open_
         assert unhash(lex_of(tokens[1])) == word_str
         assert unhash(lex_of(tokens[2])) == close_
-        assert unhash(sic_of(tokens[0])) == string


 def test_two_different(paired_puncts):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = "`" + open_ + word_str + close_ + "'"
-        tokens = expand_chunk(lookup(string))
+        tokens = tokenize(string)
         assert len(tokens) == 5
         assert unhash(lex_of(tokens[0])) == "`"
         assert unhash(lex_of(tokens[1])) == open_
@@ -19,15 +19,12 @@ def test_two_words():


 def test_punct():
-    lex_ids = tokenize('hello, possums.')
-    assert len(lex_ids) == 4
-    assert lex_ids[0] != lookup('hello')
-    assert lex_of(lex_ids[0]) == lex_of(lookup('hello'))
-    assert lex_ids[2] == lookup('possums.')
-    assert lex_of(lex_ids[2]) == lex_of(lookup('possums.'))
-    assert lex_of(lex_ids[2]) == lex_of(lookup('possums'))
-    assert lex_of(lex_ids[1]) != lex_of(lookup('hello'))
-    assert lex_ids[0] != lookup('hello.')
+    tokens = tokenize('hello, possums.')
+    assert len(tokens) == 4
+    assert lex_of(tokens[0]) == lex_of(lookup('hello'))
+    assert lex_of(tokens[1]) == lex_of(lookup(','))
+    assert lex_of(tokens[2]) == lex_of(lookup('possums'))
+    assert lex_of(tokens[1]) != lex_of(lookup('hello'))


 def test_digits():