mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 09:44:36 +03:00
* Struggling with arbitrary attr access...
This commit is contained in:
parent
314658b31c
commit
811b7a6b91
|
@ -1,5 +0,0 @@
|
||||||
Cython API
|
|
||||||
==========
|
|
||||||
|
|
||||||
Cheat Sheet
|
|
||||||
-----------
|
|
|
@ -1,2 +0,0 @@
|
||||||
Adding a Language
|
|
||||||
=================
|
|
|
@ -1,45 +0,0 @@
|
||||||
Python API
|
|
||||||
==========
|
|
||||||
|
|
||||||
.. py:currentmodule:: spacy.en
|
|
||||||
|
|
||||||
To and from unicode strings
|
|
||||||
---------------------------
|
|
||||||
|
|
||||||
.. autofunction:: tokenize
|
|
||||||
.. autofunction:: lookup
|
|
||||||
.. autofunction:: unhash
|
|
||||||
|
|
||||||
Access (Hashed) String Views
|
|
||||||
----------------------------
|
|
||||||
|
|
||||||
.. autofunction:: lex_of
|
|
||||||
.. autofunction:: norm_of
|
|
||||||
.. autofunction:: shape_of
|
|
||||||
.. autofunction:: last3_of
|
|
||||||
|
|
||||||
Access String Properties
|
|
||||||
------------------------
|
|
||||||
|
|
||||||
.. autofunction:: length_of
|
|
||||||
.. autofunction:: first_of
|
|
||||||
|
|
||||||
Check Orthographic Flags
|
|
||||||
-------------------------
|
|
||||||
|
|
||||||
.. autofunction:: is_alpha
|
|
||||||
.. autofunction:: is_digit
|
|
||||||
.. autofunction:: is_punct
|
|
||||||
.. autofunction:: is_space
|
|
||||||
.. autofunction:: is_lower
|
|
||||||
.. autofunction:: is_upper
|
|
||||||
.. autofunction:: is_title
|
|
||||||
.. autofunction:: is_ascii
|
|
||||||
|
|
||||||
Access Distributional Information
|
|
||||||
---------------------------------
|
|
||||||
|
|
||||||
.. autofunction:: prob_of
|
|
||||||
.. autofunction:: cluster_of
|
|
||||||
.. autofunction:: check_tag_flag
|
|
||||||
.. autofunction:: check_dist_flag
|
|
26
spacy/en.pxd
26
spacy/en.pxd
|
@ -1,19 +1,39 @@
|
||||||
from libcpp.vector cimport vector
|
from libcpp.vector cimport vector
|
||||||
|
|
||||||
from spacy.spacy cimport StringHash
|
from spacy.spacy cimport StringHash
|
||||||
from spacy.spacy cimport Lexeme
|
from spacy.lexeme cimport Lexeme
|
||||||
from spacy.spacy cimport Lexeme_addr
|
from spacy.lexeme cimport LexID
|
||||||
|
from spacy.lexeme cimport ClusterID
|
||||||
|
|
||||||
from spacy.spacy cimport Language
|
from spacy.spacy cimport Language
|
||||||
from spacy.tokens cimport Tokens
|
from spacy.tokens cimport Tokens
|
||||||
|
cimport cython
|
||||||
|
|
||||||
|
|
||||||
|
ctypedef fused AttrType:
|
||||||
|
ClusterID
|
||||||
|
StringHash
|
||||||
|
cython.char
|
||||||
|
|
||||||
|
|
||||||
|
cdef enum AttrName:
|
||||||
|
LEX
|
||||||
|
FIRST
|
||||||
|
LENGTH
|
||||||
|
CLUSTER
|
||||||
|
NORM
|
||||||
|
SHAPE
|
||||||
|
LAST3
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cdef class English(spacy.Language):
|
cdef class English(spacy.Language):
|
||||||
cdef int find_split(self, unicode word)
|
cdef int find_split(self, unicode word)
|
||||||
cdef int set_orth(self, unicode word, Lexeme* lex) except -1
|
cdef int set_orth(self, unicode word, Lexeme* lex) except -1
|
||||||
|
cdef AttrType attr_of(self, LexID lex_id, AttrName attr) except *
|
||||||
|
|
||||||
cdef English EN
|
cdef English EN
|
||||||
|
|
||||||
cpdef Lexeme_addr lookup(unicode word) except 0
|
cpdef LexID lookup(unicode word) except 0
|
||||||
cpdef Tokens tokenize(unicode string)
|
cpdef Tokens tokenize(unicode string)
|
||||||
cpdef unicode unhash(StringHash hash_value)
|
cpdef unicode unhash(StringHash hash_value)
|
||||||
|
|
73
spacy/en.pyx
73
spacy/en.pyx
|
@ -26,10 +26,8 @@ scheme in several important respects:
|
||||||
|
|
||||||
Take care to ensure your training and run-time data is tokenized according to the
|
Take care to ensure your training and run-time data is tokenized according to the
|
||||||
same scheme. Tokenization problems are a major cause of poor performance for
|
same scheme. Tokenization problems are a major cause of poor performance for
|
||||||
NLP tools.
|
NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module
|
||||||
|
provides a fully Penn Treebank 3-compliant tokenizer.
|
||||||
If you're using a pre-trained model, the spacy.ptb3 module provides a fully Penn
|
|
||||||
Treebank 3-compliant tokenizer.
|
|
||||||
'''
|
'''
|
||||||
#The script translate_treebank_tokenization can be used to transform a treebank's
|
#The script translate_treebank_tokenization can be used to transform a treebank's
|
||||||
#annotation to use one of the spacy tokenization schemes.
|
#annotation to use one of the spacy tokenization schemes.
|
||||||
|
@ -53,8 +51,12 @@ from .lexeme import *
|
||||||
|
|
||||||
|
|
||||||
cdef class English(spacy.Language):
|
cdef class English(spacy.Language):
|
||||||
cdef int set_orth(self, unicode word, Lexeme* lex) except -1:
|
# How to ensure the order here aligns with orthography.latin?
|
||||||
pass
|
view_funcs = [
|
||||||
|
get_normalized,
|
||||||
|
get_word_shape,
|
||||||
|
get_last3
|
||||||
|
]
|
||||||
|
|
||||||
cdef int find_split(self, unicode word):
|
cdef int find_split(self, unicode word):
|
||||||
cdef size_t length = len(word)
|
cdef size_t length = len(word)
|
||||||
|
@ -74,6 +76,27 @@ cdef class English(spacy.Language):
|
||||||
i += 1
|
i += 1
|
||||||
return i
|
return i
|
||||||
|
|
||||||
|
cdef AttrType attr_of(self, LexID lex_id, AttrName attr) except *:
|
||||||
|
cdef Lexeme* w = <Lexeme*>lex_id
|
||||||
|
if attr == LEX:
|
||||||
|
return <AttrType>w.lex
|
||||||
|
elif attr == FIRST:
|
||||||
|
return w.string[0]
|
||||||
|
elif attr == LENGTH:
|
||||||
|
return w.length
|
||||||
|
elif attr == CLUSTER:
|
||||||
|
return w.cluster
|
||||||
|
elif attr == NORM:
|
||||||
|
return w.string_views[0]
|
||||||
|
elif attr == SHAPE:
|
||||||
|
return w.string_views[1]
|
||||||
|
elif attr == LAST3:
|
||||||
|
return w.string_views[2]
|
||||||
|
else:
|
||||||
|
raise AttributeError(attr)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cdef bint check_punct(unicode word, size_t i, size_t length):
|
cdef bint check_punct(unicode word, size_t i, size_t length):
|
||||||
# Don't count apostrophes as punct if the next char is a letter
|
# Don't count apostrophes as punct if the next char is a letter
|
||||||
|
@ -110,9 +133,6 @@ cpdef Tokens tokenize(unicode string):
|
||||||
return EN.tokenize(string)
|
return EN.tokenize(string)
|
||||||
|
|
||||||
|
|
||||||
# +49 151 4336 2587
|
|
||||||
|
|
||||||
|
|
||||||
cpdef LexID lookup(unicode string) except 0:
|
cpdef LexID lookup(unicode string) except 0:
|
||||||
"""Retrieve (or create, if not found) a Lexeme ID for a string.
|
"""Retrieve (or create, if not found) a Lexeme ID for a string.
|
||||||
|
|
||||||
|
@ -124,7 +144,7 @@ cpdef LexID lookup(unicode string) except 0:
|
||||||
Returns:
|
Returns:
|
||||||
lexeme (LexID): A reference to a lexical type.
|
lexeme (LexID): A reference to a lexical type.
|
||||||
"""
|
"""
|
||||||
return <Lexeme_addr>EN.lookup(string)
|
return <LexID>EN.lookup(string)
|
||||||
|
|
||||||
|
|
||||||
cpdef unicode unhash(StringHash hash_value):
|
cpdef unicode unhash(StringHash hash_value):
|
||||||
|
@ -142,3 +162,36 @@ cpdef unicode unhash(StringHash hash_value):
|
||||||
string (unicode): A unicode string that hashes to the hash_value.
|
string (unicode): A unicode string that hashes to the hash_value.
|
||||||
"""
|
"""
|
||||||
return EN.unhash(hash_value)
|
return EN.unhash(hash_value)
|
||||||
|
|
||||||
|
|
||||||
|
def add_string_views(view_funcs):
|
||||||
|
"""Add a string view to existing and previous lexical entries.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
get_view (function): A unicode --> unicode function.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
view_id (int): An integer key you can use to access the view.
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def load_clusters(location):
|
||||||
|
"""Load cluster data.
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
def load_unigram_probs(location):
|
||||||
|
"""Load unigram probabilities.
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
def load_case_stats(location):
|
||||||
|
"""Load case stats.
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
def load_tag_stats(location):
|
||||||
|
"""Load tag statistics.
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
from libc.stdint cimport uint32_t
|
from libc.stdint cimport uint32_t
|
||||||
from libc.stdint cimport uint64_t
|
from libc.stdint cimport uint64_t
|
||||||
|
cimport cython
|
||||||
|
|
||||||
ctypedef int ClusterID
|
ctypedef int ClusterID
|
||||||
ctypedef uint32_t StringHash
|
ctypedef uint32_t StringHash
|
||||||
|
|
|
@ -10,10 +10,9 @@ cdef enum OrthFlag:
|
||||||
|
|
||||||
|
|
||||||
cdef enum:
|
cdef enum:
|
||||||
LEX
|
|
||||||
LAST3
|
|
||||||
NORM
|
NORM
|
||||||
SHAPE
|
SHAPE
|
||||||
|
LAST3
|
||||||
|
|
||||||
from spacy.lexeme cimport LexID
|
from spacy.lexeme cimport LexID
|
||||||
from spacy.lexeme cimport StringHash
|
from spacy.lexeme cimport StringHash
|
||||||
|
|
|
@ -1,20 +1,38 @@
|
||||||
# cython: embedsignature=True
|
# cython: embedsignature=True
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from spacy.lexeme cimport Lexeme
|
from spacy.lexeme cimport Lexeme
|
||||||
|
|
||||||
def get_normalized(unicode lex):
|
def get_normalized(unicode word):
|
||||||
if lex.isalpha() and lex.islower():
|
"""Todo.
|
||||||
return lex
|
|
||||||
|
Args:
|
||||||
|
word (unicode)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
normalized (unicode)
|
||||||
|
"""
|
||||||
|
if word.isalpha() and word.islower():
|
||||||
|
return word
|
||||||
else:
|
else:
|
||||||
return get_word_shape(lex)
|
return get_word_shape(word)
|
||||||
|
|
||||||
|
|
||||||
def get_word_shape(unicode lex):
|
def get_word_shape(unicode word):
|
||||||
cdef size_t length = len(lex)
|
"""Todo.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
word (unicode)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
shape (unicode)
|
||||||
|
"""
|
||||||
|
cdef size_t length = len(word)
|
||||||
shape = ""
|
shape = ""
|
||||||
last = ""
|
last = ""
|
||||||
shape_char = ""
|
shape_char = ""
|
||||||
seq = 0
|
seq = 0
|
||||||
for c in lex:
|
for c in word:
|
||||||
if c.isalpha():
|
if c.isalpha():
|
||||||
if c.isupper():
|
if c.isupper():
|
||||||
shape_char = "X"
|
shape_char = "X"
|
||||||
|
@ -35,8 +53,14 @@ def get_word_shape(unicode lex):
|
||||||
return shape
|
return shape
|
||||||
|
|
||||||
|
|
||||||
|
cpdef unicode get_last3(unicode string):
|
||||||
|
return string[-3:]
|
||||||
|
|
||||||
|
|
||||||
cpdef bint is_alpha(LexID lex_id) except *:
|
cpdef bint is_alpha(LexID lex_id) except *:
|
||||||
"""Give the result of unicode.isalpha() for a Lexeme ID.
|
"""Check whether all characters in the word's string are alphabetic.
|
||||||
|
|
||||||
|
Should match the :py:func:`unicode.isalpha()` function.
|
||||||
|
|
||||||
>>> is_alpha(lookup(u'Hello'))
|
>>> is_alpha(lookup(u'Hello'))
|
||||||
True
|
True
|
||||||
|
@ -49,7 +73,9 @@ cpdef bint is_alpha(LexID lex_id) except *:
|
||||||
|
|
||||||
|
|
||||||
cpdef bint is_digit(LexID lex_id) except *:
|
cpdef bint is_digit(LexID lex_id) except *:
|
||||||
"""Give the result of unicode.isdigit() for a Lexeme ID.
|
"""Check whether all characters in the word's string are numeric.
|
||||||
|
|
||||||
|
Should match the :py:func:`unicode.isdigit()` function.
|
||||||
|
|
||||||
>>> is_digit(lookup(u'10'))
|
>>> is_digit(lookup(u'10'))
|
||||||
True
|
True
|
||||||
|
@ -62,8 +88,8 @@ cpdef bint is_digit(LexID lex_id) except *:
|
||||||
|
|
||||||
|
|
||||||
cpdef bint is_punct(LexID lex_id) except *:
|
cpdef bint is_punct(LexID lex_id) except *:
|
||||||
"""Give the result of checking whether all characters belong to a punctuation
|
"""Check whether all characters belong to a punctuation unicode data category
|
||||||
unicode data category for a Lexeme ID.
|
for a Lexeme ID.
|
||||||
|
|
||||||
>>> is_punct(lookup(u'.'))
|
>>> is_punct(lookup(u'.'))
|
||||||
True
|
True
|
||||||
|
@ -78,11 +104,11 @@ cpdef bint is_punct(LexID lex_id) except *:
|
||||||
cpdef bint is_space(LexID lex_id) except *:
|
cpdef bint is_space(LexID lex_id) except *:
|
||||||
"""Give the result of unicode.isspace() for a Lexeme ID.
|
"""Give the result of unicode.isspace() for a Lexeme ID.
|
||||||
|
|
||||||
>>> is_space(lookup(u'\t'))
|
>>> is_space(lookup(u'\\t'))
|
||||||
True
|
True
|
||||||
>>> is_space(lookup(u'<unicode space>'))
|
>>> is_space(lookup(u'<unicode space>'))
|
||||||
True
|
True
|
||||||
>>> is_space(lookup(u'Hi\n'))
|
>>> is_space(lookup(u'Hi\\n'))
|
||||||
False
|
False
|
||||||
"""
|
"""
|
||||||
return (<Lexeme*>lex_id).orth_flags & 1 << IS_SPACE
|
return (<Lexeme*>lex_id).orth_flags & 1 << IS_SPACE
|
||||||
|
@ -144,8 +170,8 @@ cpdef StringHash norm_of(LexID lex_id) except 0:
|
||||||
"""Return the hash of a "normalized" version of the string.
|
"""Return the hash of a "normalized" version of the string.
|
||||||
|
|
||||||
Normalized strings are intended to be less sparse, while still capturing
|
Normalized strings are intended to be less sparse, while still capturing
|
||||||
important lexical information. See spacy.latin.orthography.normalize_string for details of the normalization
|
important lexical information. See :py:func:`spacy.latin.orthography.normalize_string`
|
||||||
function.
|
for details of the normalization function.
|
||||||
|
|
||||||
>>> unhash(norm_of(lookup(u'Hi')))
|
>>> unhash(norm_of(lookup(u'Hi')))
|
||||||
u'hi'
|
u'hi'
|
||||||
|
@ -160,7 +186,7 @@ cpdef StringHash norm_of(LexID lex_id) except 0:
|
||||||
cpdef StringHash shape_of(LexID lex_id) except 0:
|
cpdef StringHash shape_of(LexID lex_id) except 0:
|
||||||
"""Return the hash of a string describing the word's "orthographic shape".
|
"""Return the hash of a string describing the word's "orthographic shape".
|
||||||
|
|
||||||
Orthographic shapes are calculated by the spacy.orthography.latin.string_shape
|
Orthographic shapes are calculated by the :py:func:`spacy.orthography.latin.string_shape`
|
||||||
function. Word shape features have been found useful for NER and POS tagging,
|
function. Word shape features have been found useful for NER and POS tagging,
|
||||||
e.g. Manning (2011)
|
e.g. Manning (2011)
|
||||||
|
|
||||||
|
|
|
@ -24,6 +24,7 @@ TAGS = {}
|
||||||
DIST_FLAGS = {}
|
DIST_FLAGS = {}
|
||||||
|
|
||||||
cdef class Language:
|
cdef class Language:
|
||||||
|
view_funcs = []
|
||||||
def __cinit__(self, name):
|
def __cinit__(self, name):
|
||||||
self.name = name
|
self.name = name
|
||||||
self.bacov = {}
|
self.bacov = {}
|
||||||
|
@ -90,13 +91,41 @@ cdef class Language:
|
||||||
cdef bytes byte_string = string.encode('utf8')
|
cdef bytes byte_string = string.encode('utf8')
|
||||||
word.string = <char*>byte_string
|
word.string = <char*>byte_string
|
||||||
word.length = len(byte_string)
|
word.length = len(byte_string)
|
||||||
self.set_orth(string, word)
|
|
||||||
|
|
||||||
word.lex = hash(string)
|
word.lex = hash(string)
|
||||||
|
word.string_views = <StringHash*>calloc(len(self.view_funcs), sizeof(StringHash))
|
||||||
|
cdef unicode view
|
||||||
|
cdef StringHash hashed
|
||||||
|
for i, view_func in enumerate(self.view_funcs):
|
||||||
|
view = view_func(string)
|
||||||
|
hashed = hash(view)
|
||||||
|
word.string_views[i] = hashed
|
||||||
|
self.bacov[hashed] = view
|
||||||
self.bacov[word.lex] = string
|
self.bacov[word.lex] = string
|
||||||
self.vocab[word.lex] = <LexID>word
|
self.vocab[word.lex] = <LexID>word
|
||||||
return word
|
return word
|
||||||
|
|
||||||
|
def add_view_funcs(self, list view_funcs):
|
||||||
|
self.view_funcs.extend(view_funcs)
|
||||||
|
cdef size_t nr_views = len(self.view_funcs)
|
||||||
|
|
||||||
|
cdef unicode view
|
||||||
|
cdef StringHash hashed
|
||||||
|
cdef StringHash key
|
||||||
|
cdef unicode string
|
||||||
|
cdef LexID lex_id
|
||||||
|
cdef Lexeme* word
|
||||||
|
|
||||||
|
for key, lex_id in self.vocab:
|
||||||
|
word = <Lexeme*>lex_id
|
||||||
|
free(word.string_views)
|
||||||
|
word.string_views = <StringHash*>calloc(nr_views, sizeof(StringHash))
|
||||||
|
string = word.string[:word.length].decode('utf8')
|
||||||
|
for i, view_func in enumerate(self.view_funcs):
|
||||||
|
view = view_func(string)
|
||||||
|
hashed = hash(view)
|
||||||
|
word.string_views[i] = hashed
|
||||||
|
self.bacov[hashed] = view
|
||||||
|
|
||||||
cpdef unicode unhash(self, StringHash hash_value):
|
cpdef unicode unhash(self, StringHash hash_value):
|
||||||
'''Fetch a string from the reverse index, given its hash value.'''
|
'''Fetch a string from the reverse index, given its hash value.'''
|
||||||
return self.bacov[hash_value]
|
return self.bacov[hash_value]
|
||||||
|
|
|
@ -3,7 +3,6 @@ from cython.operator cimport preincrement as inc
|
||||||
|
|
||||||
|
|
||||||
from spacy.lexeme cimport Lexeme
|
from spacy.lexeme cimport Lexeme
|
||||||
#from spacy.lexeme cimport attr_of, lex_of, norm_of, shape_of
|
|
||||||
from spacy.spacy cimport StringHash
|
from spacy.spacy cimport StringHash
|
||||||
|
|
||||||
|
|
||||||
|
@ -66,8 +65,7 @@ cdef class Tokens:
|
||||||
cdef StringHash key
|
cdef StringHash key
|
||||||
cdef Lexeme_addr t
|
cdef Lexeme_addr t
|
||||||
for t in self.vctr[0]:
|
for t in self.vctr[0]:
|
||||||
#key = attr_of(t, attr)
|
key = self.lang.attr_of(t, attr)
|
||||||
key = 0
|
|
||||||
if key in indices:
|
if key in indices:
|
||||||
groups[indices[key]].append(t)
|
groups[indices[key]].append(t)
|
||||||
else:
|
else:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user