mirror of https://github.com/explosion/spaCy.git
commit 811b7a6b91
parent 314658b31c

* Struggling with arbitrary attr access...
@@ -1,5 +0,0 @@
-Cython API
-==========
-
-Cheat Sheet
------------
@@ -1,2 +0,0 @@
-Adding a Language
-=================
@@ -1,45 +0,0 @@
-Python API
-==========
-
-.. py:currentmodule:: spacy.en
-
-To and from unicode strings
----------------------------
-
-.. autofunction:: tokenize
-.. autofunction:: lookup
-.. autofunction:: unhash
-
-Access (Hashed) String Views
-----------------------------
-
-.. autofunction:: lex_of
-.. autofunction:: norm_of
-.. autofunction:: shape_of
-.. autofunction:: last3_of
-
-Access String Properties
-------------------------
-
-.. autofunction:: length_of
-.. autofunction:: first_of
-
-Check Orthographic Flags
--------------------------
-
-.. autofunction:: is_alpha
-.. autofunction:: is_digit
-.. autofunction:: is_punct
-.. autofunction:: is_space
-.. autofunction:: is_lower
-.. autofunction:: is_upper
-.. autofunction:: is_title
-.. autofunction:: is_ascii
-
-Access Distributional Information
----------------------------------
-
-.. autofunction:: prob_of
-.. autofunction:: cluster_of
-.. autofunction:: check_tag_flag
-.. autofunction:: check_dist_flag
spacy/en.pxd (26 lines changed)
@@ -1,19 +1,39 @@
 from libcpp.vector cimport vector

 from spacy.spacy cimport StringHash
-from spacy.spacy cimport Lexeme
-from spacy.spacy cimport Lexeme_addr
+from spacy.lexeme cimport Lexeme
+from spacy.lexeme cimport LexID
+from spacy.lexeme cimport ClusterID

 from spacy.spacy cimport Language
 from spacy.tokens cimport Tokens
+cimport cython
+
+
+ctypedef fused AttrType:
+    ClusterID
+    StringHash
+    cython.char
+
+
+cdef enum AttrName:
+    LEX
+    FIRST
+    LENGTH
+    CLUSTER
+    NORM
+    SHAPE
+    LAST3
+
+
 cdef class English(spacy.Language):
     cdef int find_split(self, unicode word)
+    cdef int set_orth(self, unicode word, Lexeme* lex) except -1
+    cdef AttrType attr_of(self, LexID lex_id, AttrName attr) except *


 cdef English EN

-cpdef Lexeme_addr lookup(unicode word) except 0
+cpdef LexID lookup(unicode word) except 0
 cpdef Tokens tokenize(unicode string)
 cpdef unicode unhash(StringHash hash_value)
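Taken together, the cpdef layer declared here gives a small round-trip API. A usage sketch, with return values taken from the doctests later in this commit (the import path for norm_of is an assumption about where the view accessors live):

    from spacy.en import tokenize, lookup, unhash
    from spacy.lexeme import norm_of   # assumed location of the view accessors

    lex_id = lookup(u'Hi')             # LexID: interned on first sight
    print(unhash(norm_of(lex_id)))     # u'hi', per the norm_of doctest below
    tokens = tokenize(u'Hello world')  # a Tokens sequence of LexIDs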
spacy/en.pyx (73 lines changed)
@@ -26,10 +26,8 @@ scheme in several important respects:

 Take care to ensure your training and run-time data is tokenized according to the
 same scheme. Tokenization problems are a major cause of poor performance for
-NLP tools.
-
-If you're using a pre-trained model, the spacy.ptb3 module provides a fully Penn
-Treebank 3-compliant tokenizer.
+NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module
+provides a fully Penn Treebank 3-compliant tokenizer.
 '''
 #The script translate_treebank_tokenization can be used to transform a treebank's
 #annotation to use one of the spacy tokenization schemes.
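The warning in that docstring is easy to make concrete: features learned against one tokenization never fire on another. A plain-Python illustration (not spaCy API):

    # PTB-style tokenization splits contractions; naive whitespace does not.
    ptb_style = [u'do', u"n't"]    # how Penn Treebank tokenizes "don't"
    naive = u"don't".split()       # [u"don't"] -- a single token
    assert ptb_style != naive      # same text, two incompatible schemes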
@@ -53,8 +51,12 @@ from .lexeme import *

 cdef class English(spacy.Language):
     cdef int set_orth(self, unicode word, Lexeme* lex) except -1:
         pass
+    # How to ensure the order here aligns with orthography.latin?
+    view_funcs = [
+        get_normalized,
+        get_word_shape,
+        get_last3
+    ]

     cdef int find_split(self, unicode word):
         cdef size_t length = len(word)
@@ -74,6 +76,27 @@ cdef class English(spacy.Language):
             i += 1
         return i

+    cdef AttrType attr_of(self, LexID lex_id, AttrName attr) except *:
+        cdef Lexeme* w = <Lexeme*>lex_id
+        if attr == LEX:
+            return <AttrType>w.lex
+        elif attr == FIRST:
+            return w.string[0]
+        elif attr == LENGTH:
+            return w.length
+        elif attr == CLUSTER:
+            return w.cluster
+        elif attr == NORM:
+            return w.string_views[0]
+        elif attr == SHAPE:
+            return w.string_views[1]
+        elif attr == LAST3:
+            return w.string_views[2]
+        else:
+            raise AttributeError(attr)
+
+
 cdef bint check_punct(unicode word, size_t i, size_t length):
     # Don't count apostrophes as punct if the next char is a letter
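A note on the fused return type: Cython resolves a fused type like AttrType at compile time, one specialization per call site, while attr_of wants to choose the return type from a runtime AttrName value — a mismatch that is presumably the "struggling" in the commit title. The if/elif chain is a runtime dispatch; in plain Python the same thing is just a lookup table (names mirror the diff, but this is an illustrative stand-in, not spaCy API):

    LEX, FIRST, LENGTH, CLUSTER, NORM, SHAPE, LAST3 = range(7)

    def attr_of(w, attr):
        # w stands in for a Lexeme*; string_views holds NORM/SHAPE/LAST3 hashes
        getters = {
            LEX: lambda w: w.lex,
            FIRST: lambda w: w.string[0],
            LENGTH: lambda w: w.length,
            CLUSTER: lambda w: w.cluster,
            NORM: lambda w: w.string_views[0],
            SHAPE: lambda w: w.string_views[1],
            LAST3: lambda w: w.string_views[2],
        }
        try:
            return getters[attr](w)
        except KeyError:
            raise AttributeError(attr)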
@@ -110,9 +133,6 @@ cpdef Tokens tokenize(unicode string):
     return EN.tokenize(string)

-
-# +49 151 4336 2587
-
 cpdef LexID lookup(unicode string) except 0:
     """Retrieve (or create, if not found) a Lexeme ID for a string.
@@ -124,7 +144,7 @@ cpdef LexID lookup(unicode string) except 0:
     Returns:
         lexeme (LexID): A reference to a lexical type.
     """
-    return <Lexeme_addr>EN.lookup(string)
+    return <LexID>EN.lookup(string)


 cpdef unicode unhash(StringHash hash_value):
@@ -142,3 +162,36 @@ cpdef unicode unhash(StringHash hash_value):
         string (unicode): A unicode string that hashes to the hash_value.
     """
     return EN.unhash(hash_value)
+
+
+def add_string_views(view_funcs):
+    """Add string views to existing and previous lexical entries.
+
+    Args:
+        view_funcs (list): A list of unicode --> unicode functions.
+
+    Returns:
+        view_id (int): An integer key you can use to access the view.
+    """
+    pass
+
+
+def load_clusters(location):
+    """Load cluster data.
+    """
+    pass
+
+
+def load_unigram_probs(location):
+    """Load unigram probabilities.
+    """
+    pass
+
+
+def load_case_stats(location):
+    """Load case stats.
+    """
+    pass
+
+
+def load_tag_stats(location):
+    """Load tag statistics.
+    """
+    pass
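The loaders and add_string_views are stubs in this commit, but the docstring pins down the intended contract: any unicode-to-unicode function can become a view. A sketch of how that contract would be exercised once implemented (the view function and the returned key are hypothetical):

    def reversed_view(string):
        # any unicode -> unicode function qualifies as a string view
        return string[::-1]

    view_id = add_string_views([reversed_view])  # int key, per the docstring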
@@ -1,6 +1,6 @@
-from libc.stdint cimport uint32_t
+from libc.stdint cimport uint64_t

 cimport cython

 ctypedef int ClusterID
-ctypedef uint32_t StringHash
+ctypedef uint64_t StringHash
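The commit does not say why StringHash widens from 32 to 64 bits, but the usual reason is collision risk in a hash-to-string reverse index like bacov below: by the birthday bound, 32-bit hashes are expected to collide after only about 82,000 distinct strings, which is smaller than a realistic vocabulary. A quick check:

    import math

    # Expected number of random strings before two share a hash value.
    for bits in (32, 64):
        print(bits, int(math.sqrt(math.pi / 2 * 2 ** bits)))
    # 32 -> ~82,000 strings; 64 -> ~5.4 billion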
@@ -10,10 +10,9 @@ cdef enum OrthFlag:


 cdef enum:
     LEX
-    LAST3
     NORM
     SHAPE
+    LAST3

+from spacy.lexeme cimport LexID
+from spacy.lexeme cimport StringHash
@@ -1,20 +1,38 @@
 # cython: embedsignature=True
 from __future__ import unicode_literals

 from spacy.lexeme cimport Lexeme

-def get_normalized(unicode lex):
-    if lex.isalpha() and lex.islower():
-        return lex
+def get_normalized(unicode word):
+    """Todo.
+
+    Args:
+        word (unicode)
+
+    Returns:
+        normalized (unicode)
+    """
+    if word.isalpha() and word.islower():
+        return word
     else:
-        return get_word_shape(lex)
+        return get_word_shape(word)


-def get_word_shape(unicode lex):
-    cdef size_t length = len(lex)
+def get_word_shape(unicode word):
+    """Todo.
+
+    Args:
+        word (unicode)
+
+    Returns:
+        shape (unicode)
+    """
+    cdef size_t length = len(word)
     shape = ""
     last = ""
     shape_char = ""
     seq = 0
-    for c in lex:
+    for c in word:
         if c.isalpha():
             if c.isupper():
                 shape_char = "X"
@@ -35,8 +53,14 @@ def get_word_shape(unicode lex):
     return shape


 cpdef unicode get_last3(unicode string):
     return string[-3:]


 cpdef bint is_alpha(LexID lex_id) except *:
-    """Give the result of unicode.isalpha() for a Lexeme ID.
+    """Check whether all characters in the word's string are alphabetic.
+
+    Should match the :py:func:`unicode.isalpha()` function.

     >>> is_alpha(lookup(u'Hello'))
     True
@@ -49,7 +73,9 @@ cpdef bint is_alpha(LexID lex_id) except *:


 cpdef bint is_digit(LexID lex_id) except *:
-    """Give the result of unicode.isdigit() for a Lexeme ID.
+    """Check whether all characters in the word's string are numeric.
+
+    Should match the :py:func:`unicode.isdigit()` function.

     >>> is_digit(lookup(u'10'))
     True
@@ -62,8 +88,8 @@ cpdef bint is_digit(LexID lex_id) except *:


 cpdef bint is_punct(LexID lex_id) except *:
-    """Give the result of checking whether all characters belong to a punctuation
-    unicode data category for a Lexeme ID.
+    """Check whether all characters belong to a punctuation unicode data category
+    for a Lexeme ID.

     >>> is_punct(lookup(u'.'))
     True
@@ -78,11 +104,11 @@ cpdef bint is_punct(LexID lex_id) except *:
 cpdef bint is_space(LexID lex_id) except *:
     """Give the result of unicode.isspace() for a Lexeme ID.

-    >>> is_space(lookup(u'\t'))
+    >>> is_space(lookup(u'\\t'))
     True
     >>> is_space(lookup(u'<unicode space>'))
     True
-    >>> is_space(lookup(u'Hi\n'))
+    >>> is_space(lookup(u'Hi\\n'))
     False
     """
     return (<Lexeme*>lex_id).orth_flags & 1 << IS_SPACE
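The doctest edits here are escaping fixes: inside a docstring, '\t' has already been turned into a literal tab before doctest ever parses the example, so the source line renders badly and shows code other than what was written. Doubling the backslash keeps the two characters backslash-t in the docstring; doctest then evaluates u'\t' itself and builds a real tab at run time. A quick demonstration:

    raw = 'u\'\t\''       # what the docstring held before: an actual tab
    escaped = 'u\'\\t\''  # after the fix: backslash followed by 't'
    assert '\t' in raw and '\t' not in escaped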
@@ -144,8 +170,8 @@ cpdef StringHash norm_of(LexID lex_id) except 0:
     """Return the hash of a "normalized" version of the string.

     Normalized strings are intended to be less sparse, while still capturing
-    important lexical information. See spacy.latin.orthography.normalize_string for details of the normalization
-    function.
+    important lexical information. See :py:func:`spacy.latin.orthography.normalize_string`
+    for details of the normalization function.

     >>> unhash(norm_of(lookup(u'Hi')))
     u'hi'
@@ -160,7 +186,7 @@ cpdef StringHash norm_of(LexID lex_id) except 0:
 cpdef StringHash shape_of(LexID lex_id) except 0:
     """Return the hash of a string describing the word's "orthographic shape".

-    Orthographic shapes are calculated by the spacy.orthography.latin.string_shape
+    Orthographic shapes are calculated by the :py:func:`spacy.orthography.latin.string_shape`
     function. Word shape features have been found useful for NER and POS tagging,
     e.g. Manning (2011).
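For intuition about shape features: each character maps to a class symbol and long runs collapse, so sparse surface forms share a dense shape. A plain-Python sketch of the idea (the exact run-collapsing rule in get_word_shape above is cut off by the hunk; the cap of 3 here is an assumption):

    def word_shape(word, max_run=3):
        # X = uppercase, x = lowercase, d = digit; other chars pass through
        shape = []
        run = 0
        for c in word:
            if c.isalpha():
                s = 'X' if c.isupper() else 'x'
            elif c.isdigit():
                s = 'd'
            else:
                s = c
            run = run + 1 if shape and shape[-1] == s else 0
            if run < max_run:
                shape.append(s)
        return ''.join(shape)

    # word_shape(u'Pierre') -> 'Xxxx'; word_shape(u'1984') -> 'ddd'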
@@ -24,6 +24,7 @@ TAGS = {}
 DIST_FLAGS = {}

 cdef class Language:
+    view_funcs = []
     def __cinit__(self, name):
         self.name = name
         self.bacov = {}
@@ -90,13 +91,41 @@ cdef class Language:
         cdef bytes byte_string = string.encode('utf8')
         word.string = <char*>byte_string
         word.length = len(byte_string)
         self.set_orth(string, word)

         word.lex = hash(string)
+        word.string_views = <StringHash*>calloc(len(self.view_funcs), sizeof(StringHash))
+        cdef unicode view
+        cdef StringHash hashed
+        for i, view_func in enumerate(self.view_funcs):
+            view = view_func(string)
+            hashed = hash(view)
+            word.string_views[i] = hashed
+            self.bacov[hashed] = view
         self.bacov[word.lex] = string
         self.vocab[word.lex] = <LexID>word
         return word

+    def add_view_funcs(self, list view_funcs):
+        self.view_funcs.extend(view_funcs)
+        cdef size_t nr_views = len(self.view_funcs)
+
+        cdef unicode view
+        cdef StringHash hashed
+        cdef StringHash key
+        cdef unicode string
+        cdef LexID lex_id
+        cdef Lexeme* word
+
+        for key, lex_id in self.vocab:
+            word = <Lexeme*>lex_id
+            free(word.string_views)
+            word.string_views = <StringHash*>calloc(nr_views, sizeof(StringHash))
+            string = word.string[:word.length].decode('utf8')
+            for i, view_func in enumerate(self.view_funcs):
+                view = view_func(string)
+                hashed = hash(view)
+                word.string_views[i] = hashed
+                self.bacov[hashed] = view
+
     cpdef unicode unhash(self, StringHash hash_value):
         '''Fetch a string from the reverse index, given its hash value.'''
         return self.bacov[hash_value]
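add_view_funcs has to do more than append: every Lexeme created before the call carries a string_views array sized for the old function list, so the method reallocates and recomputes views for the whole vocabulary, refreshing the bacov reverse index as it goes. The same logic in plain Python, with dict and list stand-ins for the C structures:

    def add_view_funcs(lang, view_funcs):
        lang.view_funcs.extend(view_funcs)
        for word in lang.vocab.values():      # every cached Lexeme
            string = word.string
            word.string_views = []            # replaces free() + calloc()
            for view_func in lang.view_funcs:
                view = view_func(string)
                hashed = hash(view)
                word.string_views.append(hashed)
                lang.bacov[hashed] = view     # hash -> unicode reverse index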
@@ -3,7 +3,6 @@ from cython.operator cimport preincrement as inc


 from spacy.lexeme cimport Lexeme
-#from spacy.lexeme cimport attr_of, lex_of, norm_of, shape_of
 from spacy.spacy cimport StringHash
@@ -66,8 +65,7 @@ cdef class Tokens:
         cdef StringHash key
         cdef Lexeme_addr t
         for t in self.vctr[0]:
-            #key = attr_of(t, attr)
-            key = 0
+            key = self.lang.attr_of(t, attr)
             if key in indices:
                 groups[indices[key]].append(t)
             else:
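This loop (its enclosing method name falls outside the hunk) buckets token pointers by an attribute view, so for example all tokens sharing a SHAPE land in one group. The same bucketing in plain Python, with the cut-off else branch completed the obvious way (names are hypothetical):

    def group_by(tokens, attr_of, attr):
        indices, groups = {}, []
        for t in tokens:
            key = attr_of(t, attr)        # e.g. the SHAPE or CLUSTER view
            if key in indices:
                groups[indices[key]].append(t)
            else:
                indices[key] = len(groups)
                groups.append([t])
        return groups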