* Struggling with arbitrary attr access...

Matthew Honnibal 2014-08-21 23:49:14 +02:00
parent 314658b31c
commit 811b7a6b91
12 changed files with 162 additions and 89 deletions

View File

@@ -1,5 +0,0 @@
-Cython API
-==========
-Cheat Sheet
------------

View File

@@ -1,2 +0,0 @@
-Adding a Language
-=================

View File

@@ -1,45 +0,0 @@
-Python API
-==========
-
-.. py:currentmodule:: spacy.en
-
-To and from unicode strings
----------------------------
-
-.. autofunction:: tokenize
-.. autofunction:: lookup
-.. autofunction:: unhash
-
-Access (Hashed) String Views
-----------------------------
-
-.. autofunction:: lex_of
-.. autofunction:: norm_of
-.. autofunction:: shape_of
-.. autofunction:: last3_of
-
-Access String Properties
-------------------------
-
-.. autofunction:: length_of
-.. autofunction:: first_of
-
-Check Orthographic Flags
--------------------------
-
-.. autofunction:: is_alpha
-.. autofunction:: is_digit
-.. autofunction:: is_punct
-.. autofunction:: is_space
-.. autofunction:: is_lower
-.. autofunction:: is_upper
-.. autofunction:: is_title
-.. autofunction:: is_ascii
-
-Access Distributional Information
----------------------------------
-
-.. autofunction:: prob_of
-.. autofunction:: cluster_of
-.. autofunction:: check_tag_flag
-.. autofunction:: check_dist_flag
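
The deleted page above documented spaCy's early flat-function API. Typical
usage, reconstructed from the function list (a sketch, not an excerpt from
the docs):

    from spacy.en import tokenize, lookup, unhash
    from spacy.en import shape_of, is_alpha

    tokens = tokenize(u'Hello world.')
    lex_id = lookup(u'Hello')          # create-or-get a Lexeme ID
    print(unhash(shape_of(lex_id)))    # hashed string view -> unicode
    print(is_alpha(lex_id))            # orthographic flag check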

View File

@@ -1,19 +1,39 @@
 from libcpp.vector cimport vector
 from spacy.spacy cimport StringHash
-from spacy.spacy cimport Lexeme
-from spacy.spacy cimport Lexeme_addr
+from spacy.lexeme cimport Lexeme
+from spacy.lexeme cimport LexID
+from spacy.lexeme cimport ClusterID
 from spacy.spacy cimport Language
 from spacy.tokens cimport Tokens
+cimport cython
+
+
+ctypedef fused AttrType:
+    ClusterID
+    StringHash
+    cython.char
+
+
+cdef enum AttrName:
+    LEX
+    FIRST
+    LENGTH
+    CLUSTER
+    NORM
+    SHAPE
+    LAST3


 cdef class English(spacy.Language):
     cdef int find_split(self, unicode word)
     cdef int set_orth(self, unicode word, Lexeme* lex) except -1
+    cdef AttrType attr_of(self, LexID lex_id, AttrName attr) except *


 cdef English EN

-cpdef Lexeme_addr lookup(unicode word) except 0
+cpdef LexID lookup(unicode word) except 0
 cpdef Tokens tokenize(unicode string)
 cpdef unicode unhash(StringHash hash_value)

View File

@@ -26,10 +26,8 @@ scheme in several important respects:
 Take care to ensure your training and run-time data is tokenized according to the
 same scheme. Tokenization problems are a major cause of poor performance for
-NLP tools.
-
-If you're using a pre-trained model, the spacy.ptb3 module provides a fully Penn
-Treebank 3-compliant tokenizer.
+NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module
+provides a fully Penn Treebank 3-compliant tokenizer.
 '''
 #The script translate_treebank_tokenization can be used to transform a treebank's
 #annotation to use one of the spacy tokenization schemes.
@@ -53,8 +51,12 @@ from .lexeme import *

 cdef class English(spacy.Language):
-    cdef int set_orth(self, unicode word, Lexeme* lex) except -1:
-        pass
+    # How to ensure the order here aligns with orthography.latin?
+    view_funcs = [
+        get_normalized,
+        get_word_shape,
+        get_last3
+    ]

     cdef int find_split(self, unicode word):
         cdef size_t length = len(word)
@@ -74,6 +76,27 @@ cdef class English(spacy.Language):
             i += 1
         return i

+    cdef AttrType attr_of(self, LexID lex_id, AttrName attr) except *:
+        cdef Lexeme* w = <Lexeme*>lex_id
+        if attr == LEX:
+            return <AttrType>w.lex
+        elif attr == FIRST:
+            return w.string[0]
+        elif attr == LENGTH:
+            return w.length
+        elif attr == CLUSTER:
+            return w.cluster
+        elif attr == NORM:
+            return w.string_views[0]
+        elif attr == SHAPE:
+            return w.string_views[1]
+        elif attr == LAST3:
+            return w.string_views[2]
+        else:
+            raise AttributeError(attr)


 cdef bint check_punct(unicode word, size_t i, size_t length):
     # Don't count apostrophes as punct if the next char is a letter
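
Because AttrType is a fused type, Cython compiles one specialization of
attr_of per member type, and `except *` is required since no single sentinel
value can mark errors for all of them. The dispatch itself is an ordinary
ladder; in plain Python, with a dict standing in for the Lexeme struct, it
amounts to roughly:

    LEX, FIRST, LENGTH, CLUSTER, NORM, SHAPE, LAST3 = range(7)

    def attr_of(word, attr):
        # word is a dict standing in for Lexeme; NORM, SHAPE and LAST3 are
        # declared contiguously, so they index the string_views array.
        if attr == LEX:
            return word['lex']
        elif attr == FIRST:
            return word['string'][0]
        elif attr == LENGTH:
            return word['length']
        elif attr == CLUSTER:
            return word['cluster']
        elif attr in (NORM, SHAPE, LAST3):
            return word['string_views'][attr - NORM]
        raise AttributeError(attr)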
@@ -110,9 +133,6 @@ cpdef Tokens tokenize(unicode string):
     return EN.tokenize(string)

-
-# +49 151 4336 2587
-

 cpdef LexID lookup(unicode string) except 0:
     """Retrieve (or create, if not found) a Lexeme ID for a string.
@@ -124,7 +144,7 @@ cpdef LexID lookup(unicode string) except 0:
     Returns:
         lexeme (LexID): A reference to a lexical type.
     """
-    return <Lexeme_addr>EN.lookup(string)
+    return <LexID>EN.lookup(string)


 cpdef unicode unhash(StringHash hash_value):
@@ -142,3 +162,36 @@ cpdef unicode unhash(StringHash hash_value):
         string (unicode): A unicode string that hashes to the hash_value.
     """
     return EN.unhash(hash_value)
+
+
+def add_string_views(view_funcs):
+    """Add a string view to existing and previous lexical entries.
+
+    Args:
+        get_view (function): A unicode --> unicode function.
+
+    Returns:
+        view_id (int): An integer key you can use to access the view.
+    """
+    pass
+
+
+def load_clusters(location):
+    """Load cluster data.
+    """
+    pass
+
+
+def load_unigram_probs(location):
+    """Load unigram probabilities.
+    """
+    pass
+
+
+def load_case_stats(location):
+    """Load case stats.
+    """
+    pass
+
+
+def load_tag_stats(location):
+    """Load tag statistics.
+    """
+    pass
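
A hypothetical caller of the stubbed API above (add_string_views does nothing
yet, so this shows intent rather than behavior; the reversed-string view is
invented for illustration):

    from spacy import en

    def reversed_view(string):
        # toy unicode -> unicode view function
        return string[::-1]

    view_id = en.add_string_views([reversed_view])
    # view_id would then key the new view on every lexeme, old and new.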

View File

@@ -1,6 +1,6 @@
 from libc.stdint cimport uint32_t
 from libc.stdint cimport uint64_t
+cimport cython

 ctypedef int ClusterID
 ctypedef uint32_t StringHash
View File

@@ -10,10 +10,9 @@ cdef enum OrthFlag:

 cdef enum:
-    LEX
-    LAST3
     NORM
     SHAPE
+    LAST3

 from spacy.lexeme cimport LexID
 from spacy.lexeme cimport StringHash

View File

@@ -1,20 +1,38 @@
 # cython: embedsignature=True
+from __future__ import unicode_literals

 from spacy.lexeme cimport Lexeme


-def get_normalized(unicode lex):
-    if lex.isalpha() and lex.islower():
-        return lex
+def get_normalized(unicode word):
+    """Todo.
+
+    Args:
+        word (unicode)
+
+    Returns:
+        normalized (unicode)
+    """
+    if word.isalpha() and word.islower():
+        return word
     else:
-        return get_word_shape(lex)
+        return get_word_shape(word)


-def get_word_shape(unicode lex):
-    cdef size_t length = len(lex)
+def get_word_shape(unicode word):
+    """Todo.
+
+    Args:
+        word (unicode)
+
+    Returns:
+        shape (unicode)
+    """
+    cdef size_t length = len(word)
     shape = ""
     last = ""
     shape_char = ""
     seq = 0
-    for c in lex:
+    for c in word:
         if c.isalpha():
             if c.isupper():
                 shape_char = "X"
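
The hunk truncates inside the loop; the idea of the shape transform is to map
letters to "X"/"x" and digits to "d", compressing long runs of the same
symbol. A pure-Python sketch (the run cap of 4 is an assumption, since the
committed cut-off falls outside this hunk):

    def word_shape(word, max_seq=4):
        # Map each character to a shape symbol, dropping repeats once a run
        # exceeds max_seq, so shapes stay short and low-cardinality.
        shape = []
        last = ""
        seq = 0
        for c in word:
            if c.isalpha():
                shape_char = "X" if c.isupper() else "x"
            elif c.isdigit():
                shape_char = "d"
            else:
                shape_char = c
            if shape_char == last:
                seq += 1
            else:
                seq = 0
                last = shape_char
            if seq < max_seq:
                shape.append(shape_char)
        return "".join(shape)

    # word_shape(u"Google") == u"Xxxxx"; word_shape(u"C3PO") == u"XdXX"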
@@ -35,8 +53,14 @@ def get_word_shape(unicode lex):
     return shape


+cpdef unicode get_last3(unicode string):
+    return string[-3:]
+
+

 cpdef bint is_alpha(LexID lex_id) except *:
-    """Give the result of unicode.isalpha() for a Lexeme ID.
+    """Check whether all characters in the word's string are alphabetic.
+
+    Should match the :py:func:`unicode.isalpha()` function.

     >>> is_alpha(lookup(u'Hello'))
     True
@@ -49,7 +73,9 @@ cpdef bint is_alpha(LexID lex_id) except *:

 cpdef bint is_digit(LexID lex_id) except *:
-    """Give the result of unicode.isdigit() for a Lexeme ID.
+    """Check whether all characters in the word's string are numeric.
+
+    Should match the :py:func:`unicode.isdigit()` function.

     >>> is_digit(lookup(u'10'))
     True
@@ -62,8 +88,8 @@ cpdef bint is_digit(LexID lex_id) except *:

 cpdef bint is_punct(LexID lex_id) except *:
-    """Give the result of checking whether all characters belong to a punctuation
-    unicode data category for a Lexeme ID.
+    """Check whether all characters belong to a punctuation unicode data category
+    for a Lexeme ID.

     >>> is_punct(lookup(u'.'))
     True
@@ -78,11 +104,11 @@ cpdef bint is_punct(LexID lex_id) except *:

 cpdef bint is_space(LexID lex_id) except *:
     """Give the result of unicode.isspace() for a Lexeme ID.

-    >>> is_space(lookup(u'\t'))
+    >>> is_space(lookup(u'\\t'))
     True
     >>> is_space(lookup(u'<unicode space>'))
     True
-    >>> is_space(lookup(u'Hi\n'))
+    >>> is_space(lookup(u'Hi\\n'))
     False
     """
     return (<Lexeme*>lex_id).orth_flags & 1 << IS_SPACE
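
Each orthographic predicate reads a single bit of the lexeme's packed
orth_flags field. The scheme, sketched with illustrative flag positions (the
committed OrthFlag values may differ):

    IS_ALPHA, IS_DIGIT, IS_PUNCT, IS_SPACE = 0, 1, 2, 3  # assumed positions

    def set_orth_flags(string):
        flags = 0
        flags |= string.isalpha() << IS_ALPHA
        flags |= string.isdigit() << IS_DIGIT
        flags |= string.isspace() << IS_SPACE
        return flags

    def is_space(flags):
        return bool(flags & (1 << IS_SPACE))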
@@ -144,8 +170,8 @@ cpdef StringHash norm_of(LexID lex_id) except 0:
     """Return the hash of a "normalized" version of the string.

     Normalized strings are intended to be less sparse, while still capturing
-    important lexical information. See spacy.latin.orthography.normalize_string for details of the normalization
-    function.
+    important lexical information. See :py:func:`spacy.latin.orthography.normalize_string`
+    for details of the normalization function.

     >>> unhash(norm_of(lookup(u'Hi')))
     u'hi'
@@ -160,7 +186,7 @@ cpdef StringHash norm_of(LexID lex_id) except 0:

 cpdef StringHash shape_of(LexID lex_id) except 0:
     """Return the hash of a string describing the word's "orthographic shape".

-    Orthographic shapes are calculated by the spacy.orthography.latin.string_shape
+    Orthographic shapes are calculated by the :py:func:`spacy.orthography.latin.string_shape`
     function. Word shape features have been found useful for NER and POS tagging,
     e.g. Manning (2011).
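
Since views are stored as hashes, reading one back goes through the reverse
index; a doctest-style sketch (the exact output depends on the shape
function):

    >>> unhash(shape_of(lookup(u'Hello')))
    u'Xxxxx'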

View File

@@ -24,6 +24,7 @@ TAGS = {}
 DIST_FLAGS = {}

 cdef class Language:
+    view_funcs = []
     def __cinit__(self, name):
         self.name = name
         self.bacov = {}
@@ -90,13 +91,41 @@ cdef class Language:
         cdef bytes byte_string = string.encode('utf8')
         word.string = <char*>byte_string
         word.length = len(byte_string)
-        self.set_orth(string, word)
         word.lex = hash(string)
+        word.string_views = <StringHash*>calloc(len(self.view_funcs), sizeof(StringHash))
+        cdef unicode view
+        cdef StringHash hashed
+        for i, view_func in enumerate(self.view_funcs):
+            view = view_func(string)
+            hashed = hash(view)
+            word.string_views[i] = hashed
+            self.bacov[hashed] = view

         self.bacov[word.lex] = string
         self.vocab[word.lex] = <LexID>word
         return word
+    def add_view_funcs(self, list view_funcs):
+        self.view_funcs.extend(view_funcs)
+        cdef size_t nr_views = len(self.view_funcs)
+
+        cdef unicode view
+        cdef StringHash hashed
+        cdef StringHash key
+        cdef unicode string
+        cdef LexID lex_id
+        cdef Lexeme* word
+        for key, lex_id in self.vocab.items():
+            word = <Lexeme*>lex_id
+            free(word.string_views)
+            word.string_views = <StringHash*>calloc(nr_views, sizeof(StringHash))
+            string = word.string[:word.length].decode('utf8')
+            for i, view_func in enumerate(self.view_funcs):
+                view = view_func(string)
+                hashed = hash(view)
+                word.string_views[i] = hashed
+                self.bacov[hashed] = view

     cpdef unicode unhash(self, StringHash hash_value):
         '''Fetch a string from the reverse index, given its hash value.'''
         return self.bacov[hash_value]
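
The registry logic in both hunks amounts to: compute every registered view of
a string, hash it, keep the hashes on the lexeme, and record hash -> string
in the reverse index ("bacov"). A pure-Python sketch with dicts standing in
for the C structs:

    class Language(object):
        def __init__(self):
            self.view_funcs = []
            self.vocab = {}   # lex hash -> lexeme
            self.bacov = {}   # string hash -> unicode string

        def new_lexeme(self, string):
            word = {'lex': hash(string)}
            word['views'] = [self._intern(f(string)) for f in self.view_funcs]
            self.bacov[word['lex']] = string
            self.vocab[word['lex']] = word
            return word

        def add_view_funcs(self, view_funcs):
            # Extend the registry, then recompute views for existing entries.
            self.view_funcs.extend(view_funcs)
            for word in self.vocab.values():
                string = self.bacov[word['lex']]
                word['views'] = [self._intern(f(string)) for f in self.view_funcs]

        def _intern(self, view):
            hashed = hash(view)
            self.bacov[hashed] = view
            return hashed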

View File

@@ -3,7 +3,6 @@ from cython.operator cimport preincrement as inc

 from spacy.lexeme cimport Lexeme
-#from spacy.lexeme cimport attr_of, lex_of, norm_of, shape_of

 from spacy.spacy cimport StringHash

@@ -66,8 +65,7 @@ cdef class Tokens:
         cdef StringHash key
         cdef Lexeme_addr t
         for t in self.vctr[0]:
-            #key = attr_of(t, attr)
-            key = 0
+            key = self.lang.attr_of(t, attr)
             if key in indices:
                 groups[indices[key]].append(t)
             else:
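
group_by buckets tokens by the hashed attribute key, preserving first-seen
group order; the same logic in plain Python (mirroring the hunk above, with
attr_of passed in rather than read off a Language object):

    def group_by(tokens, attr, attr_of):
        indices = {}   # key -> position in groups
        groups = []
        for t in tokens:
            key = attr_of(t, attr)
            if key in indices:
                groups[indices[key]].append(t)
            else:
                indices[key] = len(groups)
                groups.append([t])
        return groups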