Mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 09:57:26 +03:00)

Initial commit. Tests passing for punctuation handling. Need contractions, file transport, tokenize function, etc.
commit 556f6a18ca
parent 5c1705d5be
							
								
								
									
spacy/__init__.py (new file, 28 lines)
@@ -0,0 +1,28 @@
from .lexeme import lex_of
from .lexeme import sic_of


__all__ = [lex_of, sic_of]


"""
from .tokens import ids_from_string
from .tokens import group_by

from .lex import sic_of
from .lex import lex_of
from .lex import normed_of
from .lex import first_of
from .lex import last_three_of

from .lex import cluster_of
from .lex import prob_of

from .lex import is_oft_upper
from .lex import is_oft_title

from .lex import can_noun
from .lex import can_verb
from .lex import can_adj
from .lex import can_adv
"""
							
								
								
									
spacy/en.cpp (new file, 4529 lines)
(File diff suppressed because it is too large.)
							
								
								
									
spacy/en.pxd (new file, 17 lines)
@@ -0,0 +1,17 @@
from ext.sparsehash cimport dense_hash_map
from spacy.lexeme cimport StringHash
from spacy.lexeme cimport Lexeme


ctypedef Py_UNICODE* string_ptr
ctypedef size_t Lexeme_addr # For python interop
ctypedef Lexeme* Lexeme_ptr


cdef dense_hash_map[StringHash, Lexeme_ptr] LEXEMES


cpdef Lexeme_addr lookup(unicode word) except 0
cpdef Lexeme_addr lookup_chunk(unicode chunk, int start, int end) except 0
cdef StringHash hash_string(unicode s, size_t length) except 0
cpdef unicode unhash(StringHash hash_value)
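A minimal usage sketch of the API declared above, as seen from Python space (illustrative only, not part of the commit; it assumes the Cython extensions have been built, and mirrors what the tests further down do):

    # Illustrative sketch; assumes spacy.en and spacy.lexeme compile and import.
    from spacy import lex_of, sic_of
    from spacy.en import lookup, unhash

    addr = lookup(u'Hello!')                  # Lexeme_addr: address of a Lexeme struct
    assert unhash(lex_of(addr)) == u'Hello'   # lex: the word with punctuation split off
    assert unhash(sic_of(addr)) == u'Hello!'  # sic: the original chunk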
							
								
								
									
spacy/en.pyx (new file, 165 lines)
@@ -0,0 +1,165 @@
'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
so that strings can be retrieved from hashes.  Use 64-bit hash values and
boldly assume no collisions.
'''
from __future__ import unicode_literals

from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t

from spacy.lexeme cimport Lexeme
from ext.murmurhash cimport MurmurHash64A
from ext.murmurhash cimport MurmurHash64B


STRINGS = {}
LEXEMES = dense_hash_map[StringHash, Lexeme_ptr]()
LEXEMES.set_empty_key(0)


cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)


cpdef Lexeme_addr lookup(unicode string) except 0:
    '''.. function:: enumerate(sequence[, start=0])
    Fetch a Lexeme representing a word string. If the word has not been seen,
    construct one, splitting off any attached punctuation or clitics.  A
    reference to BLANK_WORD is returned for the empty string.

    To specify the boundaries of the word if it has not been seen, use lookup_chunk.
    '''
    if string == '':
        return <Lexeme_addr>&BLANK_WORD
    cdef size_t length = len(string)
    cdef StringHash hashed = hash_string(string, length)
    cdef Lexeme* word_ptr = LEXEMES[hashed]
    cdef size_t n
    if word_ptr == NULL:
        word_ptr = _add(hashed, string, _find_split(string, length), length)
    return <Lexeme_addr>word_ptr


cpdef Lexeme_addr lookup_chunk(unicode string, int start, int end) except 0:
    '''Fetch a Lexeme representing a word string. If the word has not been seen,
    construct one, given the specified start and end indices.  A negative index
    signifies 0 for start, and the string length for end --- i.e. the string
    will not be sliced if start == -1 and end == -1.

    A reference to BLANK_WORD is returned for the empty string.
    '''
    if string == '':
        return <Lexeme_addr>&BLANK_WORD
    cdef size_t length = len(string)
    cdef StringHash hashed = hash_string(string, length)
    cdef Lexeme* chunk_ptr = LEXEMES[hashed]
    if chunk_ptr == NULL:
        chunk_ptr = _add(hashed, string, start, length)
    return <Lexeme_addr>chunk_ptr


cdef StringHash hash_string(unicode s, size_t length) except 0:
    '''Hash unicode with MurmurHash64A'''
    assert length
    return MurmurHash64A(<string_ptr>s, length * sizeof(Py_UNICODE), 0)


cpdef unicode unhash(StringHash hash_value):
    '''Fetch a string from the reverse index, given its hash value.'''
    cdef string_ptr string = STRINGS[hash_value]
    if string == NULL:
        raise ValueError(hash_value)

    return string


cdef unicode normalize_word_string(unicode word):
    '''Return a normalized version of the word, mapping:
    - 4 digit strings into !YEAR
    - Other digit strings into !DIGITS
    - All other strings into lower-case
    '''
    cdef unicode s
    if word.isdigit() and len(word) == 4:
        return '!YEAR'
    elif word[0].isdigit():
        return '!DIGITS'
    else:
        return word.lower()


cpdef unicode _substr(unicode string, int start, int end, size_t length):
    if end >= length:
        end = -1
    if start >= length:
        start = 0
    if start <= 0 and end < 0:
        return string
    elif start < 0:
        start = 0
    elif end < 0:
        end = length
    return string[start:end]


cdef Lexeme* _add(StringHash hashed, unicode string, int split, size_t length) except NULL:
    assert string
    assert split <= length
    word = _init_lexeme(string, hashed, split, length)
    LEXEMES[hashed] = word
    STRINGS[hashed] = string
    return word


cdef Lexeme* _init_lexeme(unicode string, StringHash hashed,
                          int split, size_t length) except NULL:
    assert split <= length
    cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))

    word.first = <Py_UNICODE>(string[0] if string else 0)
    word.sic = hashed

    cdef unicode tail_string
    cdef unicode lex
    if split != 0 and split < length:
        lex = _substr(string, 0, split, length)
        tail_string = _substr(string, split, length, length)
    else:
        lex = string
        tail_string = ''
    assert lex
    cdef unicode normed = normalize_word_string(lex)
    cdef unicode last3 = _substr(string, length - 3, length, length)

    assert normed
    assert len(normed)

    word.lex = hash_string(lex, len(lex))
    word.normed = hash_string(normed, len(normed))
    word.last3 = hash_string(last3, len(last3))

    STRINGS[word.lex] = lex
    STRINGS[word.normed] = normed
    STRINGS[word.last3] = last3

    # These are loaded later
    word.prob = 0
    word.cluster = 0
    word.oft_upper = False
    word.oft_title = False

    # Now recurse, and deal with the tail
    if tail_string:
        word.tail = <Lexeme*>lookup(tail_string)
    return word


cdef size_t _find_split(unicode word, size_t length):
    cdef size_t i = 0
    if word[0].isalnum():
        while i < length and word[i].isalnum():
            i += 1
    else:
        # Split off a punctuation character, or a sequence of the same punctuation character
        while i < length and not word[i].isalnum() and (i == 0 or word[i-1] == word[i]):
            i += 1
    return i
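A pure-Python transcription of _find_split above makes the splitting rule easy to check by hand (the function name and the example values below are illustrative additions, not part of the commit):

    def find_split(word):
        # Mirrors _find_split: length of the leading alphanumeric run, or of the
        # leading run of one repeated punctuation character.
        i = 0
        length = len(word)
        if word[0].isalnum():
            while i < length and word[i].isalnum():
                i += 1
        else:
            while i < length and not word[i].isalnum() and (i == 0 or word[i - 1] == word[i]):
                i += 1
        return i

    assert find_split(u'Hello!') == 5   # 'Hello' + '!'
    assert find_split(u'!!!') == 3      # a run of the same punctuation stays together
    assert find_split(u'("Hi') == 1     # '(' is split off alone before '"'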
							
								
								
									
spacy/lexeme.cpp (new file, 2433 lines)
(File diff suppressed because it is too large.)
							
								
								
									
spacy/lexeme.pxd (new file, 35 lines)
@@ -0,0 +1,35 @@
from libc.stdint cimport uint64_t


ctypedef int ClusterID
ctypedef uint64_t StringHash


cdef struct Lexeme:
    StringHash sic # Hash of the original string
    StringHash lex # Hash of the word, with punctuation and clitics split off
    StringHash normed # Hash of the normalized version of lex
    StringHash last3 # Last 3 characters of the token
    Py_UNICODE first # First character of the token

    double prob # What is the log probability of the lex value?
    ClusterID cluster # Brown cluster of the token

    bint oft_upper # Is the lowered version of the lex value often in all caps?
    bint oft_title # Is the lowered version of the lex value often title-cased?
    Lexeme* tail # Lexemes are linked lists, to deal with sub-tokens


# Use these to access the Lexeme fields via get_attr(Lexeme*, LexAttr), which
# has a conditional to pick out the correct item.  This allows safe iteration
# over the Lexeme, via:
# for field in range(LexAttr.n): get_attr(Lexeme*, field)
cdef enum HashFields:
    sic
    lex
    normed
    cluster
    n


#cdef uint64_t get_attr(Lexeme* word, HashFields attr)
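For readers following along from Python, a rough pure-Python mirror of the struct above (an approximation for illustration only; hash fields become plain ints and the tail pointer becomes an object reference):

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class PyLexeme:                        # illustrative stand-in for the C struct
        sic: int                           # hash of the original string
        lex: int                           # hash of the word, punctuation/clitics split off
        normed: int                        # hash of the normalized version of lex
        last3: int                         # hash of the last three characters
        first: str                         # first character of the token
        prob: float                        # log probability of the lex value
        cluster: int                       # Brown cluster ID
        oft_upper: bool                    # often written in all caps?
        oft_title: bool                    # often written title-cased?
        tail: Optional["PyLexeme"] = None  # sub-tokens form a linked list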
							
								
								
									
spacy/lexeme.pyx (new file, 114 lines)
@@ -0,0 +1,114 @@
'''Accessors for Lexeme properties, given a lex_id, which is cast to a Lexeme*.
Mostly useful from Python-space. From Cython-space, you can just cast to
Lexeme* yourself.
'''


cpdef StringHash sic_of(size_t lex_id) except 0:
    '''Access the `sic' field of the Lexeme pointed to by lex_id.

    The sic field stores the hash of the whitespace-delimited string-chunk used to
    construct the Lexeme.

    >>> [unhash(sic_of(lex_id)) for lex_id in from_string(u'Hi! world')]
    [u'Hi!', u'', u'world']
    '''
    return (<Lexeme*>lex_id).sic


cpdef StringHash lex_of(size_t lex_id) except 0:
    '''Access the `lex' field of the Lexeme pointed to by lex_id.

    The lex field is the hash of the string you would expect to get back from
    a standard tokenizer, i.e. the word with punctuation and other non-whitespace
    delimited tokens split off.  The other fields refer to properties of the
    string that the lex field stores a hash of, except sic and tail.

    >>> [unhash(lex_of(lex_id)) for lex_id in from_string(u'Hi! world')]
    [u'Hi', u'!', u'world']
    '''
    return (<Lexeme*>lex_id).lex


cpdef ClusterID cluster_of(size_t lex_id):
    '''Access the `cluster' field of the Lexeme pointed to by lex_id, which
    gives an integer representation of the cluster ID of the word,
    which should be understood as a binary address:

    >>> strings = (u'pineapple', u'apple', u'dapple', u'scalable')
    >>> token_ids = [lookup(s) for s in strings]
    >>> clusters = [cluster_of(t) for t in token_ids]
    >>> print ["{0:b}".format(cluster_of(t)) for t in token_ids]
    ["100111110110", "100111100100", "01010111011001", "100111110110"]

    The clusterings are unideal, but often slightly useful.
    "pineapple" and "apple" share a long prefix, indicating a similar meaning,
    while "dapple" is totally different. On the other hand, "scalable" receives
    the same cluster ID as "pineapple", which is not what we'd like.
    '''
    return (<Lexeme*>lex_id).cluster


cpdef Py_UNICODE first_of(size_t lex_id):
    '''Access the `first' field of the Lexeme pointed to by lex_id, which
    stores the first character of the lex string of the word.

    >>> lex_id = lookup(u'Hello')
    >>> unhash(first_of(lex_id))
    u'H'
    '''
    return (<Lexeme*>lex_id).first


cpdef double prob_of(size_t lex_id):
    '''Access the `prob' field of the Lexeme pointed to by lex_id, which stores
    the smoothed unigram log probability of the word, as estimated from a large
    text corpus.  By default, probabilities are based on counts from Gigaword,
    smoothed using Kneser-Ney; but any probabilities file can be supplied to
    load_probs.

    >>> prob_of(lookup(u'world'))
    -20.10340371976182
    '''
    pass


cpdef StringHash last3_of(size_t lex_id):
    '''Access the `last3' field of the Lexeme pointed to by lex_id, which stores
    the hash of the last three characters of the word:

    >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
    [u'llo', u'!']
    '''
    return (<Lexeme*>lex_id).last3


cpdef bint is_oft_upper(size_t lex_id):
    '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
    stores whether the lowered version of the string hashed by `lex' is found
    in all-upper case frequently in a large sample of text.  Users are free
    to load different data; by default we use a sample from Wikipedia, with
    a threshold of 0.95, picked to maximize mutual information for POS tagging.

    >>> is_oft_upper(lookup(u'abc'))
    True
    >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
    True
    '''
    return (<Lexeme*>lex_id).oft_upper


cpdef bint is_oft_title(size_t lex_id):
    '''Access the `oft_title' field of the Lexeme pointed to by lex_id, which
    stores whether the lowered version of the string hashed by `lex' is found
    title-cased frequently in a large sample of text.  Users are free
    to load different data; by default we use a sample from Wikipedia, with
    a threshold of 0.3, picked to maximize mutual information for POS tagging.

    >>> is_oft_title(lookup(u'marcus'))
    True
    >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
    True
    '''
    return (<Lexeme*>lex_id).oft_title
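The cluster_of docstring treats cluster IDs as binary addresses whose shared prefixes signal similarity; a small sketch of that comparison, using the bit-strings quoted in the docstring (the helper itself is an illustrative addition, not part of the commit):

    def shared_prefix(a, b):
        # Count the leading bits two Brown-cluster bit-strings have in common.
        n = 0
        while n < min(len(a), len(b)) and a[n] == b[n]:
            n += 1
        return n

    shared_prefix("100111110110", "100111100100")    # 6 shared bits: 'pineapple' vs 'apple'
    shared_prefix("100111110110", "01010111011001")  # 0 shared bits: 'pineapple' vs 'dapple'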
							
								
								
									
spacy/spacy.cpp (new file, 2064 lines)
(File diff suppressed because it is too large.)
							
								
								
									
spacy/spacy.pxd (new file, 5 lines)
@@ -0,0 +1,5 @@
from libcpp.vector cimport vector
from spacy.lexeme cimport Lexeme


cpdef vector[size_t] expand_chunk(size_t addr) except *
							
								
								
									
spacy/spacy.pyx (new file, 72 lines)
@@ -0,0 +1,72 @@
from __future__ import unicode_literals
from spacy.lexeme cimport Lexeme


cpdef vector[size_t] expand_chunk(size_t addr) except *:
    cdef vector[size_t] tokens = vector[size_t]()
    word = <Lexeme*>addr
    while word is not NULL:
        tokens.push_back(<size_t>word)
        word = word.tail
    return tokens


"""
cpdef vector[size_t] ids_from_text(unicode text) except *:
    cdef size_t length = len(text)
    cdef Py_UNICODE* characters = <Py_UNICODE*>text

    cdef size_t i
    cdef Py_UNICODE c

    cdef vector[size_t] tokens = vector[size_t]()
    cdef unicode current = u''
    cdef Lexeme* token
    cdef int alnum_end = -1
    cdef size_t alnum_start = 0
    cdef bint seen_alnum = False
    for i in range(length):
        c = characters[i]
        if is_whitespace(c):
            token = <Lexeme*>lookup(current)
            tokens.push_back(<size_t>token)
            clitic = 0
            while token.clitics[clitic]:
                tokens.push_back(token.clitics[clitic])
                clitic += 1
            current = u''
            alnum_start = 0
            alnum_end = -1
            seen_alnum = False
        else:
            if not seen_alnum and c.isalnum():
                alnum_start = i
                seen_alnum = True
            elif seen_alnum and alnum_end == -1 and not c.isalnum():
                alnum_end = i
            current += c
    if current:
        token = <Lexeme*>lookup(current)
        tokens.push_back(<size_t>token)
        clitic = 0
        while token.clitics[clitic]:
            tokens.push_back(token.clitics[clitic])
            clitic += 1
    return tokens
"""

#cdef vector[Tokens] group_by(Tokens tokens, LexAttr field) except *:
#    pass


cdef inline bint is_whitespace(Py_UNICODE c):
    # TODO: Support other unicode spaces
    # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
    if c == u' ':
        return True
    elif c == u'\n':
        return True
    elif c == u'\t':
        return True
    else:
        return False
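expand_chunk simply walks the tail pointers set up in _init_lexeme, so a chunk address expands into the addresses of its sub-tokens. A usage sketch mirroring the tests below (illustrative only; it assumes the extensions are built):

    from spacy import lex_of
    from spacy.spacy import expand_chunk
    from spacy.en import lookup, unhash

    tokens = expand_chunk(lookup(u'Hello!'))
    assert [unhash(lex_of(t)) for t in tokens] == [u'Hello', u'!']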
							
								
								
									
spacy/util.py (new file, 75 lines)
@@ -0,0 +1,75 @@
def utf8open(loc, mode='r'):
    return codecs.open(loc, mode, 'utf8')


def load_case_stats(data_dir):
    case_loc = path.join(data_dir, 'english.case')
    case_stats = {}
    with utf8open(case_loc) as cases_file:
        for line in cases_file:
            word, upper, title = line.split()
            case_stats[word] = (float(upper), float(title))
    return case_stats


def load_clitics(data_dir):
    clitics_loc = path.join(data_dir, 'clitics.txt')
    entries = []
    seen = set()
    with utf8open(clitics_loc) as clitics_file:
        for line in clitics_file:
            line = line.strip()
            if line.startswith('#'):
                continue
            if not line:
                continue
            clitics = line.split()
            word = clitics.pop(0)
            norm_form = clitics.pop(0)
            assert word not in seen, word
            seen.add(word)
            entries.append((word, norm_form, clitics))
    return entries


"""
    def load_browns(self, data_dir):
        cdef Lexeme* w
        case_stats = load_case_stats(data_dir)
        brown_loc = path.join(data_dir, 'bllip-clusters')
        assert path.exists(brown_loc)
        cdef size_t start
        cdef int end
        with utf8open(brown_loc) as browns_file:
            for i, line in enumerate(browns_file):
                cluster_str, word, freq_str = line.split()
                # Decode as a little-endian string, so that we can do & 15 to get
                # the first 4 bits. See redshift._parse_features.pyx
                cluster = int(cluster_str[::-1], 2)
                upper_pc, title_pc = case_stats.get(word.lower(), (0.0, 0.0))
                start = 0
                end = -1
                find_slice(&start, &end, word)
                print "Load", repr(word), start, end
                w = <Lexeme*>init_word(word, start, end, cluster,
                                       upper_pc, title_pc, int(freq_str))
                self.words[_hash_str(word)] = <size_t>w
                self.strings[<size_t>w] = word

    def load_clitics(self, data_dir):
        cdef unicode orig_str
        cdef unicode clitic
        for orig_str, norm_form, clitic_strs in util.load_clitics(data_dir):
            w = init_clitic(orig_str, <Lexeme*>self.lookup_slice(norm_form, 0, -1))
            self.words[w.orig] = <size_t>w
            self.strings[<size_t>w] = orig_str
            assert len(clitic_strs) < MAX_CLITICS
            assert clitic_strs
            for i, clitic in enumerate(clitic_strs):
                # If we write punctuation here, assume we want to keep it,
                # so tell it the slice boundaries (the full string)
                w.clitics[i] = self.lookup_slice(clitic, 0, -1)
            # Ensure we null terminate
            w.clitics[i+1] = 0
"""
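load_clitics expects whitespace-separated lines of the form "word norm_form clitic1 clitic2 ...", skipping blank lines and lines starting with #. A sketch of that parsing on made-up entries (the entries are hypothetical; note that util.py as committed uses codecs and path without importing them, so this sketch stays self-contained):

    # Hypothetical clitics entries, parsed the same way load_clitics parses each line.
    lines = [u"isnt   is    is n't",
             u"youre  you   you 're"]
    entries = []
    for line in lines:
        pieces = line.split()
        word, norm_form, clitics = pieces[0], pieces[1], pieces[2:]
        entries.append((word, norm_form, clitics))
    # entries[0] == (u'isnt', u'is', [u'is', u"n't"])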
							
								
								
									
										
tests/.test_tokenizer.py.swo (new file, binary)
(Binary file not shown.)

tests/my_test.py (new file, 0 lines)
							
								
								
									
tests/test_post_punct.py (new file, 48 lines)
@@ -0,0 +1,48 @@
from __future__ import unicode_literals

from spacy import lex_of
from spacy.spacy import expand_chunk
from spacy.en import lookup
from spacy.en import unhash

import pytest


@pytest.fixture
def close_puncts():
    return [')', ']', '}', '*']


def test_close(close_puncts):
    word_str = 'Hello'
    for p in close_puncts:
        string = word_str + p
        token = lookup(string)
        assert unhash(lex_of(token)) == word_str
        tokens = expand_chunk(token)
        assert len(tokens) == 2
        assert unhash(lex_of(tokens[0])) == word_str
        assert unhash(lex_of(tokens[1])) == p


def test_two_different_close(close_puncts):
    word_str = 'Hello'
    for p in close_puncts:
        string = word_str + p + "'"
        token = lookup(string)
        assert unhash(lex_of(token)) == word_str
        tokens = expand_chunk(token)
        assert len(tokens) == 3
        assert unhash(lex_of(tokens[0])) == word_str
        assert unhash(lex_of(tokens[1])) == p
        assert unhash(lex_of(tokens[2])) == "'"


def test_three_same_close(close_puncts):
    word_str = 'Hello'
    for p in close_puncts:
        string = word_str + p + p + p
        tokens = expand_chunk(lookup(string))
        assert len(tokens) == 2
        assert unhash(lex_of(tokens[0])) == word_str
        assert unhash(lex_of(tokens[1])) == p + p + p
							
								
								
									
tests/test_pre_punct.py (new file, 50 lines)
@@ -0,0 +1,50 @@
from __future__ import unicode_literals

from spacy import lex_of
from spacy.spacy import expand_chunk
from spacy.en import lookup
from spacy.en import unhash

import pytest


@pytest.fixture
def open_puncts():
    return ['(', '[', '{', '*']


def test_open(open_puncts):
    word_str = 'Hello'
    for p in open_puncts:
        string = p + word_str
        token = lookup(string)
        assert unhash(lex_of(token)) == p
        tokens = expand_chunk(token)
        assert len(tokens) == 2
        assert unhash(lex_of(tokens[0])) == p
        assert unhash(lex_of(tokens[1])) == word_str


def test_two_different_open(open_puncts):
    word_str = 'Hello'
    for p in open_puncts:
        string = p + "`" + word_str
        token = lookup(string)
        assert unhash(lex_of(token)) == p
        tokens = expand_chunk(token)
        assert len(tokens) == 3
        assert unhash(lex_of(tokens[0])) == p
        assert unhash(lex_of(tokens[1])) == "`"
        assert unhash(lex_of(tokens[2])) == word_str


def test_three_same_open(open_puncts):
    word_str = 'Hello'
    for p in open_puncts:
        string = p + p + p + word_str
        token = lookup(string)
        assert unhash(lex_of(token)) == p + p + p
        tokens = expand_chunk(token)
        assert len(tokens) == 2
        assert unhash(lex_of(tokens[0])) == p + p + p
        assert unhash(lex_of(tokens[1])) == word_str
							
								
								
									
tests/test_surround_punct.py (new file, 39 lines)
@@ -0,0 +1,39 @@
from __future__ import unicode_literals

from spacy import lex_of, sic_of
from spacy.spacy import expand_chunk
from spacy.en import lookup
from spacy.en import unhash

import pytest


@pytest.fixture
def paired_puncts():
    return [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]


def test_token(paired_puncts):
    word_str = 'Hello'
    for open_, close_ in paired_puncts:
        string = open_ + word_str + close_
        tokens = expand_chunk(lookup(string))
        assert len(tokens) == 3
        assert unhash(lex_of(tokens[0])) == open_
        assert unhash(lex_of(tokens[1])) == word_str
        assert unhash(lex_of(tokens[2])) == close_
        assert unhash(sic_of(tokens[0])) == string


def test_two_different(paired_puncts):
    word_str = 'Hello'
    for open_, close_ in paired_puncts:
        string = "`" + open_ + word_str + close_ + "'"
        tokens = expand_chunk(lookup(string))
        assert len(tokens) == 5
        assert unhash(lex_of(tokens[0])) == "`"
        assert unhash(lex_of(tokens[1])) == open_
        assert unhash(lex_of(tokens[2])) == word_str
        assert unhash(lex_of(tokens[2])) == word_str
        assert unhash(lex_of(tokens[3])) == close_
        assert unhash(lex_of(tokens[4])) == "'"
							
								
								
									
tests/test_vocab.py (new file, 30 lines)
@@ -0,0 +1,30 @@
from __future__ import unicode_literals

from spacy import lex_of
from spacy.en import lookup
from spacy.en import unhash


def test_neq():
    addr = lookup('Hello')
    assert lookup('bye') != addr


def test_eq():
    addr = lookup('Hello')
    assert lookup('Hello') == addr


def test_round_trip():
    hello = lookup('Hello')
    assert unhash(lex_of(hello)) == 'Hello'


def test_case_neq():
    addr = lookup('Hello')
    assert lookup('hello') != addr


def test_punct_neq():
    addr = lookup('Hello')
    assert lookup('Hello,') != addr