mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	* Struggling with arbitrary attr access...
This commit is contained in:
		
							parent
							
								
									314658b31c
								
							
						
					
					
						commit
						811b7a6b91
					
				| 
						 | 
				
			
			@ -1,5 +0,0 @@
 | 
			
		|||
Cython API
 | 
			
		||||
==========
 | 
			
		||||
 | 
			
		||||
Cheat Sheet
 | 
			
		||||
-----------
 | 
			
		||||
| 
						 | 
				
			
			@ -1,2 +0,0 @@
 | 
			
		|||
Adding a Language
 | 
			
		||||
=================
 | 
			
		||||
| 
						 | 
				
			
			@ -1,45 +0,0 @@
 | 
			
		|||
Python API
 | 
			
		||||
==========
 | 
			
		||||
 | 
			
		||||
.. py:currentmodule:: spacy.en
 | 
			
		||||
 | 
			
		||||
To and from unicode strings
 | 
			
		||||
---------------------------
 | 
			
		||||
 | 
			
		||||
.. autofunction:: tokenize
 | 
			
		||||
.. autofunction:: lookup
 | 
			
		||||
.. autofunction:: unhash
 | 
			
		||||
 | 
			
		||||
Access (Hashed) String Views
 | 
			
		||||
----------------------------
 | 
			
		||||
 | 
			
		||||
.. autofunction:: lex_of
 | 
			
		||||
.. autofunction:: norm_of
 | 
			
		||||
.. autofunction:: shape_of
 | 
			
		||||
.. autofunction:: last3_of
 | 
			
		||||
 | 
			
		||||
Access String Properties
 | 
			
		||||
------------------------
 | 
			
		||||
 | 
			
		||||
.. autofunction:: length_of
 | 
			
		||||
.. autofunction:: first_of
 | 
			
		||||
 | 
			
		||||
Check Orthographic Flags
 | 
			
		||||
-------------------------
 | 
			
		||||
 | 
			
		||||
.. autofunction:: is_alpha
 | 
			
		||||
.. autofunction:: is_digit
 | 
			
		||||
.. autofunction:: is_punct
 | 
			
		||||
.. autofunction:: is_space
 | 
			
		||||
.. autofunction:: is_lower
 | 
			
		||||
.. autofunction:: is_upper
 | 
			
		||||
.. autofunction:: is_title
 | 
			
		||||
.. autofunction:: is_ascii
 | 
			
		||||
 | 
			
		||||
Access Distributional Information
 | 
			
		||||
---------------------------------
 | 
			
		||||
 | 
			
		||||
.. autofunction:: prob_of
 | 
			
		||||
.. autofunction:: cluster_of
 | 
			
		||||
.. autofunction:: check_tag_flag
 | 
			
		||||
.. autofunction:: check_dist_flag
 | 
			
		||||
							
								
								
									
										26
									
								
								spacy/en.pxd
									
									
									
									
									
								
							
							
						
						
									
										26
									
								
								spacy/en.pxd
									
									
									
									
									
								
							| 
						 | 
				
			
			@ -1,19 +1,39 @@
 | 
			
		|||
from libcpp.vector cimport vector
 | 
			
		||||
 | 
			
		||||
from spacy.spacy cimport StringHash
 | 
			
		||||
from spacy.spacy cimport Lexeme
 | 
			
		||||
from spacy.spacy cimport Lexeme_addr
 | 
			
		||||
from spacy.lexeme cimport Lexeme
 | 
			
		||||
from spacy.lexeme cimport LexID
 | 
			
		||||
from spacy.lexeme cimport ClusterID
 | 
			
		||||
 | 
			
		||||
from spacy.spacy cimport Language
 | 
			
		||||
from spacy.tokens cimport Tokens
 | 
			
		||||
cimport cython
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
ctypedef fused AttrType:
 | 
			
		||||
    ClusterID
 | 
			
		||||
    StringHash
 | 
			
		||||
    cython.char
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
cdef enum AttrName:
 | 
			
		||||
    LEX
 | 
			
		||||
    FIRST
 | 
			
		||||
    LENGTH
 | 
			
		||||
    CLUSTER
 | 
			
		||||
    NORM
 | 
			
		||||
    SHAPE
 | 
			
		||||
    LAST3
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
cdef class English(spacy.Language):
 | 
			
		||||
    cdef int find_split(self, unicode word)
 | 
			
		||||
    cdef int set_orth(self, unicode word, Lexeme* lex) except -1
 | 
			
		||||
    cdef AttrType attr_of(self, LexID lex_id, AttrName attr) except *
 | 
			
		||||
 | 
			
		||||
cdef English EN
 | 
			
		||||
 | 
			
		||||
cpdef Lexeme_addr lookup(unicode word) except 0
 | 
			
		||||
cpdef LexID lookup(unicode word) except 0
 | 
			
		||||
cpdef Tokens tokenize(unicode string)
 | 
			
		||||
cpdef unicode unhash(StringHash hash_value)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										73
									
								
								spacy/en.pyx
									
									
									
									
									
								
							
							
						
						
									
										73
									
								
								spacy/en.pyx
									
									
									
									
									
								
							| 
						 | 
				
			
			@ -26,10 +26,8 @@ scheme in several important respects:
 | 
			
		|||
 | 
			
		||||
Take care to ensure your training and run-time data is tokenized according to the
 | 
			
		||||
same scheme. Tokenization problems are a major cause of poor performance for
 | 
			
		||||
NLP tools.
 | 
			
		||||
 | 
			
		||||
If you're using a pre-trained model, the spacy.ptb3 module provides a fully Penn
 | 
			
		||||
Treebank 3-compliant tokenizer.
 | 
			
		||||
NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module
 | 
			
		||||
provides a fully Penn Treebank 3-compliant tokenizer.
 | 
			
		||||
'''
 | 
			
		||||
#The script translate_treebank_tokenization can be used to transform a treebank's
 | 
			
		||||
#annotation to use one of the spacy tokenization schemes.
 | 
			
		||||
| 
						 | 
				
			
			@ -53,8 +51,12 @@ from .lexeme import *
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
cdef class English(spacy.Language):
 | 
			
		||||
    cdef int set_orth(self, unicode word, Lexeme* lex) except -1:
 | 
			
		||||
        pass
 | 
			
		||||
    # How to ensure the order here aligns with orthography.latin?
 | 
			
		||||
    view_funcs = [
 | 
			
		||||
        get_normalized,
 | 
			
		||||
        get_word_shape,
 | 
			
		||||
        get_last3
 | 
			
		||||
    ]
 | 
			
		||||
 | 
			
		||||
    cdef int find_split(self, unicode word):
 | 
			
		||||
        cdef size_t length = len(word)
 | 
			
		||||
| 
						 | 
				
			
			@ -74,6 +76,27 @@ cdef class English(spacy.Language):
 | 
			
		|||
                i += 1
 | 
			
		||||
        return i
 | 
			
		||||
 | 
			
		||||
    cdef AttrType attr_of(self, LexID lex_id, AttrName attr) except *:
 | 
			
		||||
        cdef Lexeme* w = <Lexeme*>lex_id
 | 
			
		||||
        if attr == LEX:
 | 
			
		||||
            return <AttrType>w.lex
 | 
			
		||||
        elif attr == FIRST:
 | 
			
		||||
            return w.string[0]
 | 
			
		||||
        elif attr == LENGTH:
 | 
			
		||||
            return w.length
 | 
			
		||||
        elif attr == CLUSTER:
 | 
			
		||||
            return w.cluster
 | 
			
		||||
        elif attr == NORM:
 | 
			
		||||
            return w.string_views[0]
 | 
			
		||||
        elif attr == SHAPE:
 | 
			
		||||
            return w.string_views[1]
 | 
			
		||||
        elif attr == LAST3:
 | 
			
		||||
            return w.string_views[2]
 | 
			
		||||
        else:
 | 
			
		||||
            raise AttributeError(attr)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
cdef bint check_punct(unicode word, size_t i, size_t length):
 | 
			
		||||
    # Don't count apostrophes as punct if the next char is a letter
 | 
			
		||||
| 
						 | 
				
			
			@ -110,9 +133,6 @@ cpdef Tokens tokenize(unicode string):
 | 
			
		|||
    return EN.tokenize(string)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# +49 151 4336 2587
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
cpdef LexID lookup(unicode string) except 0:
 | 
			
		||||
    """Retrieve (or create, if not found) a Lexeme ID for a string.
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -124,7 +144,7 @@ cpdef LexID lookup(unicode string) except 0:
 | 
			
		|||
    Returns:
 | 
			
		||||
        lexeme (LexID): A reference to a lexical type.
 | 
			
		||||
    """
 | 
			
		||||
    return <Lexeme_addr>EN.lookup(string)
 | 
			
		||||
    return <LexID>EN.lookup(string)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
cpdef unicode unhash(StringHash hash_value):
 | 
			
		||||
| 
						 | 
				
			
			@ -142,3 +162,36 @@ cpdef unicode unhash(StringHash hash_value):
 | 
			
		|||
        string (unicode): A unicode string that hashes to the hash_value.
 | 
			
		||||
    """
 | 
			
		||||
    return EN.unhash(hash_value)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def add_string_views(view_funcs):
 | 
			
		||||
    """Add a string view to existing and previous lexical entries.
 | 
			
		||||
 | 
			
		||||
    Args:
 | 
			
		||||
        get_view (function): A unicode --> unicode function.
 | 
			
		||||
 | 
			
		||||
    Returns:
 | 
			
		||||
        view_id (int): An integer key you can use to access the view.
 | 
			
		||||
    """
 | 
			
		||||
    pass
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def load_clusters(location):
 | 
			
		||||
    """Load cluster data.
 | 
			
		||||
    """
 | 
			
		||||
    pass
 | 
			
		||||
 | 
			
		||||
def load_unigram_probs(location):
 | 
			
		||||
    """Load unigram probabilities.
 | 
			
		||||
    """
 | 
			
		||||
    pass
 | 
			
		||||
 | 
			
		||||
def load_case_stats(location):
 | 
			
		||||
    """Load case stats.
 | 
			
		||||
    """
 | 
			
		||||
    pass
 | 
			
		||||
 | 
			
		||||
def load_tag_stats(location):
 | 
			
		||||
    """Load tag statistics.
 | 
			
		||||
    """
 | 
			
		||||
    pass
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,6 +1,6 @@
 | 
			
		|||
from libc.stdint cimport uint32_t
 | 
			
		||||
from libc.stdint cimport uint64_t
 | 
			
		||||
 | 
			
		||||
cimport cython
 | 
			
		||||
 | 
			
		||||
ctypedef int ClusterID
 | 
			
		||||
ctypedef uint32_t StringHash
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -10,10 +10,9 @@ cdef enum OrthFlag:
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
cdef enum:
 | 
			
		||||
    LEX
 | 
			
		||||
    LAST3
 | 
			
		||||
    NORM
 | 
			
		||||
    SHAPE
 | 
			
		||||
    LAST3
 | 
			
		||||
 | 
			
		||||
from spacy.lexeme cimport LexID
 | 
			
		||||
from spacy.lexeme cimport StringHash
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,20 +1,38 @@
 | 
			
		|||
# cython: embedsignature=True
 | 
			
		||||
from __future__ import unicode_literals
 | 
			
		||||
 | 
			
		||||
from spacy.lexeme cimport Lexeme
 | 
			
		||||
 | 
			
		||||
def get_normalized(unicode lex):
 | 
			
		||||
    if lex.isalpha() and lex.islower():
 | 
			
		||||
        return lex
 | 
			
		||||
def get_normalized(unicode word):
 | 
			
		||||
    """Todo.
 | 
			
		||||
 | 
			
		||||
    Args:
 | 
			
		||||
        word (unicode)
 | 
			
		||||
 | 
			
		||||
    Returns:
 | 
			
		||||
        normalized (unicode)
 | 
			
		||||
    """
 | 
			
		||||
    if word.isalpha() and word.islower():
 | 
			
		||||
        return word
 | 
			
		||||
    else:
 | 
			
		||||
        return get_word_shape(lex)
 | 
			
		||||
        return get_word_shape(word)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_word_shape(unicode lex):
 | 
			
		||||
    cdef size_t length = len(lex)
 | 
			
		||||
def get_word_shape(unicode word):
 | 
			
		||||
    """Todo.
 | 
			
		||||
 | 
			
		||||
    Args:
 | 
			
		||||
        word (unicode)
 | 
			
		||||
 | 
			
		||||
    Returns:
 | 
			
		||||
        shape (unicode)
 | 
			
		||||
    """
 | 
			
		||||
    cdef size_t length = len(word)
 | 
			
		||||
    shape = ""
 | 
			
		||||
    last = ""
 | 
			
		||||
    shape_char = ""
 | 
			
		||||
    seq = 0
 | 
			
		||||
    for c in lex:
 | 
			
		||||
    for c in word:
 | 
			
		||||
        if c.isalpha():
 | 
			
		||||
            if c.isupper():
 | 
			
		||||
                shape_char = "X"
 | 
			
		||||
| 
						 | 
				
			
			@ -35,8 +53,14 @@ def get_word_shape(unicode lex):
 | 
			
		|||
    return shape
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
cpdef unicode get_last3(unicode string):
 | 
			
		||||
    return string[-3:]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
cpdef bint is_alpha(LexID lex_id) except *:
 | 
			
		||||
    """Give the result of unicode.isalpha() for a Lexeme ID.
 | 
			
		||||
    """Check whether all characters in the word's string are alphabetic.
 | 
			
		||||
    
 | 
			
		||||
    Should match the :py:func:`unicode.isalpha()` function.
 | 
			
		||||
 | 
			
		||||
    >>> is_alpha(lookup(u'Hello'))
 | 
			
		||||
    True
 | 
			
		||||
| 
						 | 
				
			
			@ -49,7 +73,9 @@ cpdef bint is_alpha(LexID lex_id) except *:
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
cpdef bint is_digit(LexID lex_id) except *:
 | 
			
		||||
    """Give the result of unicode.isdigit() for a Lexeme ID.
 | 
			
		||||
    """Check whether all characters in the word's string are numeric.
 | 
			
		||||
    
 | 
			
		||||
    Should match the :py:func:`unicode.isdigit()` function.
 | 
			
		||||
 | 
			
		||||
    >>> is_digit(lookup(u'10'))
 | 
			
		||||
    True
 | 
			
		||||
| 
						 | 
				
			
			@ -62,8 +88,8 @@ cpdef bint is_digit(LexID lex_id) except *:
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
cpdef bint is_punct(LexID lex_id) except *:
 | 
			
		||||
    """Give the result of checking whether all characters belong to a punctuation
 | 
			
		||||
    unicode data category for a Lexeme ID.
 | 
			
		||||
    """Check whether all characters belong to a punctuation unicode data category
 | 
			
		||||
    for a Lexeme ID.
 | 
			
		||||
 | 
			
		||||
    >>> is_punct(lookup(u'.'))
 | 
			
		||||
    True
 | 
			
		||||
| 
						 | 
				
			
			@ -78,11 +104,11 @@ cpdef bint is_punct(LexID lex_id) except *:
 | 
			
		|||
cpdef bint is_space(LexID lex_id) except *:
 | 
			
		||||
    """Give the result of unicode.isspace() for a Lexeme ID.
 | 
			
		||||
 | 
			
		||||
    >>> is_space(lookup(u'\t'))
 | 
			
		||||
    >>> is_space(lookup(u'\\t'))
 | 
			
		||||
    True
 | 
			
		||||
    >>> is_space(lookup(u'<unicode space>'))
 | 
			
		||||
    True
 | 
			
		||||
    >>> is_space(lookup(u'Hi\n'))
 | 
			
		||||
    >>> is_space(lookup(u'Hi\\n'))
 | 
			
		||||
    False
 | 
			
		||||
    """
 | 
			
		||||
    return (<Lexeme*>lex_id).orth_flags & 1 << IS_SPACE
 | 
			
		||||
| 
						 | 
				
			
			@ -144,8 +170,8 @@ cpdef StringHash norm_of(LexID lex_id) except 0:
 | 
			
		|||
    """Return the hash of a "normalized" version of the string.
 | 
			
		||||
 | 
			
		||||
    Normalized strings are intended to be less sparse, while still capturing
 | 
			
		||||
    important lexical information.  See spacy.latin.orthography.normalize_string for details of the normalization
 | 
			
		||||
    function.
 | 
			
		||||
    important lexical information.  See :py:func:`spacy.latin.orthography.normalize_string`
 | 
			
		||||
    for details of the normalization function.
 | 
			
		||||
 | 
			
		||||
    >>> unhash(norm_of(lookup(u'Hi')))
 | 
			
		||||
    u'hi'
 | 
			
		||||
| 
						 | 
				
			
			@ -160,7 +186,7 @@ cpdef StringHash norm_of(LexID lex_id) except 0:
 | 
			
		|||
cpdef StringHash shape_of(LexID lex_id) except 0:
 | 
			
		||||
    """Return the hash of a string describing the word's "orthograpgic shape".
 | 
			
		||||
 | 
			
		||||
    Orthographic shapes are calculated by the spacy.orthography.latin.string_shape
 | 
			
		||||
    Orthographic shapes are calculated by the :py:func:`spacy.orthography.latin.string_shape`
 | 
			
		||||
    function. Word shape features have been found useful for NER and POS tagging,
 | 
			
		||||
    e.g. Manning (2011)
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -24,6 +24,7 @@ TAGS = {}
 | 
			
		|||
DIST_FLAGS = {}
 | 
			
		||||
 | 
			
		||||
cdef class Language:
 | 
			
		||||
    view_funcs = []
 | 
			
		||||
    def __cinit__(self, name):
 | 
			
		||||
        self.name = name
 | 
			
		||||
        self.bacov = {}
 | 
			
		||||
| 
						 | 
				
			
			@ -90,13 +91,41 @@ cdef class Language:
 | 
			
		|||
        cdef bytes byte_string = string.encode('utf8')
 | 
			
		||||
        word.string = <char*>byte_string
 | 
			
		||||
        word.length = len(byte_string)
 | 
			
		||||
        self.set_orth(string, word)
 | 
			
		||||
 | 
			
		||||
        word.lex = hash(string)
 | 
			
		||||
        word.string_views = <StringHash*>calloc(len(self.view_funcs), sizeof(StringHash))
 | 
			
		||||
        cdef unicode view
 | 
			
		||||
        cdef StringHash hashed
 | 
			
		||||
        for i, view_func in enumerate(self.view_funcs):
 | 
			
		||||
            view = view_func(string)
 | 
			
		||||
            hashed = hash(view)
 | 
			
		||||
            word.string_views[i] = hashed
 | 
			
		||||
            self.bacov[hashed] = view
 | 
			
		||||
        self.bacov[word.lex] = string
 | 
			
		||||
        self.vocab[word.lex] = <LexID>word
 | 
			
		||||
        return word
 | 
			
		||||
 | 
			
		||||
    def add_view_funcs(self, list view_funcs):
 | 
			
		||||
        self.view_funcs.extend(view_funcs)
 | 
			
		||||
        cdef size_t nr_views = len(self.view_funcs)
 | 
			
		||||
 | 
			
		||||
        cdef unicode view
 | 
			
		||||
        cdef StringHash hashed
 | 
			
		||||
        cdef StringHash key
 | 
			
		||||
        cdef unicode string
 | 
			
		||||
        cdef LexID lex_id
 | 
			
		||||
        cdef Lexeme* word
 | 
			
		||||
 | 
			
		||||
        for key, lex_id in self.vocab:
 | 
			
		||||
            word = <Lexeme*>lex_id
 | 
			
		||||
            free(word.string_views)
 | 
			
		||||
            word.string_views = <StringHash*>calloc(nr_views, sizeof(StringHash))
 | 
			
		||||
            string = word.string[:word.length].decode('utf8')
 | 
			
		||||
            for i, view_func in enumerate(self.view_funcs):
 | 
			
		||||
                view = view_func(string)
 | 
			
		||||
                hashed = hash(view)
 | 
			
		||||
                word.string_views[i] = hashed
 | 
			
		||||
                self.bacov[hashed] = view
 | 
			
		||||
 | 
			
		||||
    cpdef unicode unhash(self, StringHash hash_value):
 | 
			
		||||
        '''Fetch a string from the reverse index, given its hash value.'''
 | 
			
		||||
        return self.bacov[hash_value]
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -3,7 +3,6 @@ from cython.operator cimport preincrement as inc
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
from spacy.lexeme cimport Lexeme
 | 
			
		||||
#from spacy.lexeme cimport attr_of, lex_of, norm_of, shape_of
 | 
			
		||||
from spacy.spacy cimport StringHash
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -66,8 +65,7 @@ cdef class Tokens:
 | 
			
		|||
        cdef StringHash key
 | 
			
		||||
        cdef Lexeme_addr t
 | 
			
		||||
        for t in self.vctr[0]:
 | 
			
		||||
            #key = attr_of(t, attr)
 | 
			
		||||
            key = 0
 | 
			
		||||
            key = self.lang.attr_of(t, attr)
 | 
			
		||||
            if key in indices:
 | 
			
		||||
                groups[indices[key]].append(t)
 | 
			
		||||
            else:
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue
	
	Block a user