* Struggling with arbitrary attr access...

This commit is contained in:
Matthew Honnibal 2014-08-21 23:49:14 +02:00
parent 314658b31c
commit 811b7a6b91
12 changed files with 162 additions and 89 deletions

View File

@ -1,5 +0,0 @@
Cython API
==========
Cheat Sheet
-----------

View File

@ -1,2 +0,0 @@
Adding a Language
=================

View File

@ -1,45 +0,0 @@
Python API
==========
.. py:currentmodule:: spacy.en
To and from unicode strings
---------------------------
.. autofunction:: tokenize
.. autofunction:: lookup
.. autofunction:: unhash
Access (Hashed) String Views
----------------------------
.. autofunction:: lex_of
.. autofunction:: norm_of
.. autofunction:: shape_of
.. autofunction:: last3_of
Access String Properties
------------------------
.. autofunction:: length_of
.. autofunction:: first_of
Check Orthographic Flags
-------------------------
.. autofunction:: is_alpha
.. autofunction:: is_digit
.. autofunction:: is_punct
.. autofunction:: is_space
.. autofunction:: is_lower
.. autofunction:: is_upper
.. autofunction:: is_title
.. autofunction:: is_ascii
Access Distributional Information
---------------------------------
.. autofunction:: prob_of
.. autofunction:: cluster_of
.. autofunction:: check_tag_flag
.. autofunction:: check_dist_flag
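
The functions listed in this deleted cheat sheet form a small, flat API. A quick sketch of how they fit together, based on the docstrings elsewhere in this commit (the exact outputs follow the norm_of and is_alpha doctests shown below):

    from spacy.en import tokenize, lookup, unhash
    from spacy.en import lex_of, norm_of, is_alpha

    lex_id = lookup(u'Hello')        # LexID: a handle to the word's Lexeme
    print(unhash(lex_of(lex_id)))    # hash -> original string: u'Hello'
    print(unhash(norm_of(lex_id)))   # normalized view: u'hello'
    print(is_alpha(lex_id))          # orthographic flag check: True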

View File

@ -1,19 +1,39 @@
from libcpp.vector cimport vector
from spacy.spacy cimport StringHash
from spacy.spacy cimport Lexeme
from spacy.spacy cimport Lexeme_addr
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport LexID
from spacy.lexeme cimport ClusterID
from spacy.spacy cimport Language
from spacy.tokens cimport Tokens
cimport cython
ctypedef fused AttrType:
ClusterID
StringHash
cython.char
cdef enum AttrName:
LEX
FIRST
LENGTH
CLUSTER
NORM
SHAPE
LAST3
cdef class English(spacy.Language):
cdef int find_split(self, unicode word)
cdef int set_orth(self, unicode word, Lexeme* lex) except -1
cdef AttrType attr_of(self, LexID lex_id, AttrName attr) except *
cdef English EN
cpdef Lexeme_addr lookup(unicode word) except 0
cpdef LexID lookup(unicode word) except 0
cpdef Tokens tokenize(unicode string)
cpdef unicode unhash(StringHash hash_value)

View File

@ -26,10 +26,8 @@ scheme in several important respects:
Take care to ensure your training and run-time data are tokenized according to the
same scheme. Tokenization problems are a major cause of poor performance for
NLP tools.
If you're using a pre-trained model, the spacy.ptb3 module provides a fully Penn
Treebank 3-compliant tokenizer.
NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module
provides a fully Penn Treebank 3-compliant tokenizer.
'''
#The script translate_treebank_tokenization can be used to transform a treebank's
#annotation to use one of the spacy tokenization schemes.
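
A hedged sketch of the intended usage: the docstring names the spacy.ptb3 module but the diff does not show its API, so the mirrored tokenize signature here is an assumption:

    # spacy.ptb3 is named in the docstring above; assuming it mirrors spacy.en
    from spacy.ptb3 import tokenize
    tokens = tokenize(u'Mr. Best flew to N.Y. on Sat.')
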
@ -53,8 +51,12 @@ from .lexeme import *
cdef class English(spacy.Language):
cdef int set_orth(self, unicode word, Lexeme* lex) except -1:
pass
# How to ensure the order here aligns with orthography.latin?
view_funcs = [
get_normalized,
get_word_shape,
get_last3
]
cdef int find_split(self, unicode word):
cdef size_t length = len(word)
@ -74,6 +76,27 @@ cdef class English(spacy.Language):
i += 1
return i
cdef AttrType attr_of(self, LexID lex_id, AttrName attr) except *:
cdef Lexeme* w = <Lexeme*>lex_id
if attr == LEX:
return <AttrType>w.lex
elif attr == FIRST:
return w.string[0]
elif attr == LENGTH:
return w.length
elif attr == CLUSTER:
return w.cluster
elif attr == NORM:
return w.string_views[0]
elif attr == SHAPE:
return w.string_views[1]
elif attr == LAST3:
return w.string_views[2]
else:
raise AttributeError(attr)
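
In pure-Python terms the dispatch above reads as follows; this is an illustrative analogue, not part of the commit. Note that NORM, SHAPE and LAST3 must index string_views in the same order that the view_funcs list [get_normalized, get_word_shape, get_last3] filled it, which is exactly the ordering worry in the comment above:

    LEX, FIRST, LENGTH, CLUSTER, NORM, SHAPE, LAST3 = range(7)

    def attr_of(word, attr):
        # word is a stand-in object with the Lexeme struct's fields
        if attr == LEX:
            return word.lex
        elif attr == FIRST:
            return word.string[0]
        elif attr == LENGTH:
            return word.length
        elif attr == CLUSTER:
            return word.cluster
        elif attr in (NORM, SHAPE, LAST3):
            return word.string_views[attr - NORM]  # relies on enum order
        else:
            raise AttributeError(attr)
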
cdef bint check_punct(unicode word, size_t i, size_t length):
# Don't count apostrophes as punct if the next char is a letter
@ -110,9 +133,6 @@ cpdef Tokens tokenize(unicode string):
return EN.tokenize(string)
# +49 151 4336 2587
cpdef LexID lookup(unicode string) except 0:
"""Retrieve (or create, if not found) a Lexeme ID for a string.
@ -124,7 +144,7 @@ cpdef LexID lookup(unicode string) except 0:
Returns:
lexeme (LexID): A reference to a lexical type.
"""
return <Lexeme_addr>EN.lookup(string)
return <LexID>EN.lookup(string)
cpdef unicode unhash(StringHash hash_value):
@ -142,3 +162,36 @@ cpdef unicode unhash(StringHash hash_value):
string (unicode): A unicode string that hashes to the hash_value.
"""
return EN.unhash(hash_value)
def add_string_views(view_funcs):
"""Add a string view to existing and previous lexical entries.
Args:
get_view (function): A unicode --> unicode function.
Returns:
view_id (int): An integer key you can use to access the view.
"""
pass
def load_clusters(location):
"""Load cluster data.
"""
pass
def load_unigram_probs(location):
"""Load unigram probabilities.
"""
pass
def load_case_stats(location):
"""Load case stats.
"""
pass
def load_tag_stats(location):
"""Load tag statistics.
"""
pass
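
The loaders above are stubs, but the docstring of add_string_views together with Language.add_view_funcs (later in this commit) pins down the intended mechanism: register unicode -> unicode functions, store the hash of each view per word, and let the returned integer index word.string_views. A minimal analogue, with the registry names invented for illustration:

    view_funcs = []                      # hypothetical module-level registry

    def add_string_view(get_view):       # get_view: unicode -> unicode
        view_funcs.append(get_view)
        return len(view_funcs) - 1       # view_id used to index string_views

    last3_id = add_string_view(lambda s: s[-3:])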

View File

@ -1,6 +1,6 @@
from libc.stdint cimport uint32_t
from libc.stdint cimport uint64_t
cimport cython
ctypedef int ClusterID
ctypedef uint32_t StringHash

View File

@ -10,10 +10,9 @@ cdef enum OrthFlag:
cdef enum:
LEX
LAST3
NORM
SHAPE
LAST3
from spacy.lexeme cimport LexID
from spacy.lexeme cimport StringHash

View File

@ -1,20 +1,38 @@
# cython: embedsignature=True
from __future__ import unicode_literals
from spacy.lexeme cimport Lexeme
def get_normalized(unicode lex):
if lex.isalpha() and lex.islower():
return lex
def get_normalized(unicode word):
"""Todo.
Args:
word (unicode)
Returns:
normalized (unicode)
"""
if word.isalpha() and word.islower():
return word
else:
return get_word_shape(lex)
return get_word_shape(word)
def get_word_shape(unicode lex):
cdef size_t length = len(lex)
def get_word_shape(unicode word):
"""Todo.
Args:
word (unicode)
Returns:
shape (unicode)
"""
cdef size_t length = len(word)
shape = ""
last = ""
shape_char = ""
seq = 0
for c in lex:
for c in word:
if c.isalpha():
if c.isupper():
shape_char = "X"
@ -35,8 +53,14 @@ def get_word_shape(unicode lex):
return shape
cpdef unicode get_last3(unicode string):
return string[-3:]
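
The diff elides the middle of the shape loop, so here is a hedged reconstruction: the digit handling and the run-length cutoff are assumptions, but they are consistent with the seq counter and the "X" branch shown above:

    def word_shape(word):
        shape = []
        last = ""
        seq = 0
        for c in word:
            if c.isalpha():
                shape_char = "X" if c.isupper() else "x"
            elif c.isdigit():
                shape_char = "d"           # assumed mapping for digits
            else:
                shape_char = c
            if shape_char == last:
                seq += 1
            else:
                seq = 0
                last = shape_char
            if seq < 3:                    # assumed cutoff for long runs
                shape.append(shape_char)
        return "".join(shape)

    # word_shape(u'Google') -> u'Xxxx'; word_shape(u'1999') -> u'ddd'
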
cpdef bint is_alpha(LexID lex_id) except *:
"""Give the result of unicode.isalpha() for a Lexeme ID.
"""Check whether all characters in the word's string are alphabetic.
Should match the :py:func:`unicode.isalpha()` function.
>>> is_alpha(lookup(u'Hello'))
True
@ -49,7 +73,9 @@ cpdef bint is_alpha(LexID lex_id) except *:
cpdef bint is_digit(LexID lex_id) except *:
"""Give the result of unicode.isdigit() for a Lexeme ID.
"""Check whether all characters in the word's string are numeric.
Should match the :py:func:`unicode.isdigit()` function.
>>> is_digit(lookup(u'10'))
True
@ -62,8 +88,8 @@ cpdef bint is_digit(LexID lex_id) except *:
cpdef bint is_punct(LexID lex_id) except *:
"""Give the result of checking whether all characters belong to a punctuation
unicode data category for a Lexeme ID.
"""Check whether all characters belong to a punctuation unicode data category
for a Lexeme ID.
>>> is_punct(lookup(u'.'))
True
@ -78,11 +104,11 @@ cpdef bint is_punct(LexID lex_id) except *:
cpdef bint is_space(LexID lex_id) except *:
"""Give the result of unicode.isspace() for a Lexeme ID.
>>> is_space(lookup(u'\t'))
>>> is_space(lookup(u'\\t'))
True
>>> is_space(lookup(u'<unicode space>'))
True
>>> is_space(lookup(u'Hi\n'))
>>> is_space(lookup(u'Hi\\n'))
False
"""
return (<Lexeme*>lex_id).orth_flags & 1 << IS_SPACE
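
Every flag checker reduces to the same one-bit test (<< binds tighter than &, so the unparenthesized expression above already groups as orth_flags & (1 << IS_SPACE)). A standalone analogue:

    IS_SPACE = 4                     # position in the OrthFlag enum (assumed)
    orth_flags = 0
    orth_flags |= 1 << IS_SPACE      # set once, when the Lexeme is constructed
    is_space = bool(orth_flags & (1 << IS_SPACE))   # True
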
@ -144,8 +170,8 @@ cpdef StringHash norm_of(LexID lex_id) except 0:
"""Return the hash of a "normalized" version of the string.
Normalized strings are intended to be less sparse, while still capturing
important lexical information. See spacy.latin.orthography.normalize_string for details of the normalization
function.
important lexical information. See :py:func:`spacy.latin.orthography.normalize_string`
for details of the normalization function.
>>> unhash(norm_of(lookup(u'Hi')))
u'hi'
@ -160,7 +186,7 @@ cpdef StringHash norm_of(LexID lex_id) except 0:
cpdef StringHash shape_of(LexID lex_id) except 0:
"""Return the hash of a string describing the word's "orthograpgic shape".
Orthographic shapes are calculated by the spacy.orthography.latin.string_shape
Orthographic shapes are calculated by the :py:func:`spacy.orthography.latin.string_shape`
function. Word shape features have been found useful for NER and POS tagging,
e.g. Manning (2011)

View File

@ -24,6 +24,7 @@ TAGS = {}
DIST_FLAGS = {}
cdef class Language:
view_funcs = []
def __cinit__(self, name):
self.name = name
self.bacov = {}
@ -90,13 +91,41 @@ cdef class Language:
cdef bytes byte_string = string.encode('utf8')
word.string = <char*>byte_string
word.length = len(byte_string)
self.set_orth(string, word)
word.lex = hash(string)
word.string_views = <StringHash*>calloc(len(self.view_funcs), sizeof(StringHash))
cdef unicode view
cdef StringHash hashed
for i, view_func in enumerate(self.view_funcs):
view = view_func(string)
hashed = hash(view)
word.string_views[i] = hashed
self.bacov[hashed] = view
self.bacov[word.lex] = string
self.vocab[word.lex] = <LexID>word
return word
def add_view_funcs(self, list view_funcs):
self.view_funcs.extend(view_funcs)
cdef size_t nr_views = len(self.view_funcs)
cdef unicode view
cdef StringHash hashed
cdef StringHash key
cdef unicode string
cdef LexID lex_id
cdef Lexeme* word
for key, lex_id in self.vocab.items():
word = <Lexeme*>lex_id
free(word.string_views)
word.string_views = <StringHash*>calloc(nr_views, sizeof(StringHash))
string = word.string[:word.length].decode('utf8')
for i, view_func in enumerate(self.view_funcs):
view = view_func(string)
hashed = hash(view)
word.string_views[i] = hashed
self.bacov[hashed] = view
cpdef unicode unhash(self, StringHash hash_value):
'''Fetch a string from the reverse index, given its hash value.'''
return self.bacov[hash_value]
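
The bacov dict (presumably "vocab" reversed) is what makes unhash possible: every string and every string view is hashed on the way in, and the hash -> unicode mapping is recorded at the same time. Stripped of the C struct handling, the scheme is:

    bacov = {}                        # StringHash -> unicode

    def intern(string):
        hashed = hash(string)
        bacov[hashed] = string        # record the reverse mapping
        return hashed

    def unhash(hash_value):
        return bacov[hash_value]

    assert unhash(intern(u'Hi')) == u'Hi'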

View File

@ -3,7 +3,6 @@ from cython.operator cimport preincrement as inc
from spacy.lexeme cimport Lexeme
#from spacy.lexeme cimport attr_of, lex_of, norm_of, shape_of
from spacy.spacy cimport StringHash
@ -66,8 +65,7 @@ cdef class Tokens:
cdef StringHash key
cdef Lexeme_addr t
for t in self.vctr[0]:
#key = attr_of(t, attr)
key = 0
key = self.lang.attr_of(t, attr)
if key in indices:
groups[indices[key]].append(t)
else:
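
The diff is truncated at the else branch, but the loop is a standard bucketing pattern; completing it (the completion is an assumption) gives, in plain Python:

    indices = {}                      # attribute key -> group index
    groups = []
    for t in token_ids:               # hypothetical iterable of token handles
        key = attr_of(t, attr)
        if key in indices:
            groups[indices[key]].append(t)
        else:
            indices[key] = len(groups)
            groups.append([t])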