From 811b7a6b91c465201f1834d6b225f3df8c475c3b Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 21 Aug 2014 23:49:14 +0200
Subject: [PATCH] * Struggling with arbitrary attr access...

---
 docs/api/cython.rst         |  5 ---
 docs/api/extending.rst      |  2 -
 docs/api/python.rst         | 45 -----------------------
 docs/guide/intro.rst        |  0
 docs/tutorial.rst           |  0
 spacy/en.pxd                | 26 +++++++++++--
 spacy/en.pyx                | 73 ++++++++++++++++++++++++++++++++-----
 spacy/lexeme.pxd            |  2 +-
 spacy/orthography/latin.pxd |  3 +-
 spacy/orthography/latin.pyx | 58 +++++++++++++++++++++--------
 spacy/spacy.pyx             | 33 ++++++++++++++++-
 spacy/tokens.pyx            |  4 +-
 12 files changed, 162 insertions(+), 89 deletions(-)
 delete mode 100644 docs/api/cython.rst
 delete mode 100644 docs/api/extending.rst
 delete mode 100644 docs/api/python.rst
 delete mode 100644 docs/guide/intro.rst
 delete mode 100644 docs/tutorial.rst

diff --git a/docs/api/cython.rst b/docs/api/cython.rst
deleted file mode 100644
index 71497061e..000000000
--- a/docs/api/cython.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-Cython API
-==========
-
-Cheat Sheet
------------
diff --git a/docs/api/extending.rst b/docs/api/extending.rst
deleted file mode 100644
index a35109ce8..000000000
--- a/docs/api/extending.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-Adding a Language
-=================
diff --git a/docs/api/python.rst b/docs/api/python.rst
deleted file mode 100644
index 95b9a7f55..000000000
--- a/docs/api/python.rst
+++ /dev/null
@@ -1,45 +0,0 @@
-Python API
-==========
-
-.. py:currentmodule:: spacy.en
-
-To and from unicode strings
----------------------------
-
-.. autofunction:: tokenize
-.. autofunction:: lookup
-.. autofunction:: unhash
-
-Access (Hashed) String Views
-----------------------------
-
-.. autofunction:: lex_of
-.. autofunction:: norm_of
-.. autofunction:: shape_of
-.. autofunction:: last3_of
-
-Access String Properties
-------------------------
-
-.. autofunction:: length_of
-.. autofunction:: first_of
-
-Check Orthographic Flags
--------------------------
-
-.. autofunction:: is_alpha
-.. autofunction:: is_digit
-.. autofunction:: is_punct
-.. autofunction:: is_space
-.. autofunction:: is_lower
-.. autofunction:: is_upper
-.. autofunction:: is_title
-.. autofunction:: is_ascii
-
-Access Distributional Information
----------------------------------
-
-.. autofunction:: prob_of
-.. autofunction:: cluster_of
-.. autofunction:: check_tag_flag
-.. autofunction:: check_dist_flag
diff --git a/docs/guide/intro.rst b/docs/guide/intro.rst
deleted file mode 100644
index e69de29bb..000000000
diff --git a/docs/tutorial.rst b/docs/tutorial.rst
deleted file mode 100644
index e69de29bb..000000000
diff --git a/spacy/en.pxd b/spacy/en.pxd
index ed08a144d..2b42ac07e 100644
--- a/spacy/en.pxd
+++ b/spacy/en.pxd
@@ -1,19 +1,39 @@
 from libcpp.vector cimport vector
 
 from spacy.spacy cimport StringHash
-from spacy.spacy cimport Lexeme
-from spacy.spacy cimport Lexeme_addr
+from spacy.lexeme cimport Lexeme
+from spacy.lexeme cimport LexID
+from spacy.lexeme cimport ClusterID
 from spacy.spacy cimport Language
 from spacy.tokens cimport Tokens
+cimport cython
+
+
+ctypedef fused AttrType:
+    ClusterID
+    StringHash
+    cython.char
+
+
+cdef enum AttrName:
+    LEX
+    FIRST
+    LENGTH
+    CLUSTER
+    NORM
+    SHAPE
+    LAST3
+
 
 cdef class English(spacy.Language):
     cdef int find_split(self, unicode word)
     cdef int set_orth(self, unicode word, Lexeme* lex) except -1
+    cdef AttrType attr_of(self, LexID lex_id, AttrName attr) except *
 
 cdef English EN
 
-cpdef Lexeme_addr lookup(unicode word) except 0
+cpdef LexID lookup(unicode word) except 0
 cpdef Tokens tokenize(unicode string)
 cpdef unicode unhash(StringHash hash_value)
diff --git a/spacy/en.pyx b/spacy/en.pyx
index af137fb8a..812b5e729 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -26,10 +26,8 @@ scheme in several important respects:
 
 Take care to ensure your training and run-time data is tokenized according to
 the same scheme. Tokenization problems are a major cause of poor performance for
-NLP tools.
-
-If you're using a pre-trained model, the spacy.ptb3 module provides a fully Penn
-Treebank 3-compliant tokenizer.
+NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module
+provides a fully Penn Treebank 3-compliant tokenizer.
 '''
 #The script translate_treebank_tokenization can be used to transform a treebank's
 #annotation to use one of the spacy tokenization schemes.
@@ -53,8 +51,12 @@ from .lexeme import *
 
 cdef class English(spacy.Language):
-    cdef int set_orth(self, unicode word, Lexeme* lex) except -1:
-        pass
+    # How to ensure the order here aligns with orthography.latin?
+    view_funcs = [
+        get_normalized,
+        get_word_shape,
+        get_last3
+    ]
 
     cdef int find_split(self, unicode word):
         cdef size_t length = len(word)
@@ -74,6 +76,27 @@ cdef class English(spacy.Language):
             i += 1
         return i
 
+    cdef AttrType attr_of(self, LexID lex_id, AttrName attr) except *:
+        cdef Lexeme* w = <Lexeme*>lex_id
+        if attr == LEX:
+            return w.lex
+        elif attr == FIRST:
+            return w.string[0]
+        elif attr == LENGTH:
+            return w.length
+        elif attr == CLUSTER:
+            return w.cluster
+        elif attr == NORM:
+            return w.string_views[0]
+        elif attr == SHAPE:
+            return w.string_views[1]
+        elif attr == LAST3:
+            return w.string_views[2]
+        else:
+            raise AttributeError(attr)
+
+
 
 cdef bint check_punct(unicode word, size_t i, size_t length):
     # Don't count apostrophes as punct if the next char is a letter
@@ -110,9 +133,6 @@ cpdef Tokens tokenize(unicode string):
     return EN.tokenize(string)
 
 
-# +49 151 4336 2587
-
-
 cpdef LexID lookup(unicode string) except 0:
     """Retrieve (or create, if not found) a Lexeme ID for a string.
 
     Args:
         string (unicode): The string to be looked up. Must be unicode, not bytes.
 
     Returns:
         lexeme (LexID): A reference to a lexical type.
     """
-    return EN.lookup(string)
+    return EN.lookup(string)
 
 
 cpdef unicode unhash(StringHash hash_value):
@@ -142,3 +162,36 @@ cpdef unicode unhash(StringHash hash_value):
     string (unicode): A unicode string that hashes to the hash_value.
     """
     return EN.unhash(hash_value)
+
+
+def add_string_views(view_funcs):
+    """Add string views to existing and future lexical entries.
+
+    Args:
+        view_funcs (list): A list of unicode --> unicode functions.
+
+    Returns:
+        view_id (int): An integer key you can use to access the view.
+    """
+    pass
+
+
+def load_clusters(location):
+    """Load cluster data.
+    """
+    pass
+
+def load_unigram_probs(location):
+    """Load unigram probabilities.
+    """
+    pass
+
+def load_case_stats(location):
+    """Load case stats.
+    """
+    pass
+
+def load_tag_stats(location):
+    """Load tag statistics.
+    """
+    pass
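The string-view machinery that en.pyx sets up is easiest to see end-to-end in plain Python. The sketch below models what the patch is working towards: each view function maps the raw string to a derived string, the derived string is hashed, the per-lexeme array stores those hashes in view_funcs order, and a reverse index (the "bacov") recovers strings from hashes. PyLexeme and PyLanguage are hypothetical stand-ins for the Cython structs, and the shape function is simplified (no run compression); none of these names appear in the patch itself::

    # Hypothetical pure-Python model of the string-view mechanism.
    NORM, SHAPE, LAST3 = 0, 1, 2  # must mirror the order of view_funcs

    def get_word_shape(word):
        # Simplified shape: 'X' upper, 'x' lower, 'd' digit; no run compression.
        return "".join("X" if c.isupper() else
                       "x" if c.isalpha() else
                       "d" if c.isdigit() else c
                       for c in word)

    def get_normalized(word):
        return word if word.isalpha() and word.islower() else get_word_shape(word)

    def get_last3(word):
        return word[-3:]

    class PyLexeme(object):
        def __init__(self, string, view_funcs, bacov):
            self.lex = hash(string)
            bacov[self.lex] = string
            self.string_views = []
            for view_func in view_funcs:
                view = view_func(string)
                self.string_views.append(hash(view))
                bacov[hash(view)] = view

    class PyLanguage(object):
        view_funcs = [get_normalized, get_word_shape, get_last3]

        def __init__(self):
            self.bacov = {}  # reverse index: hash --> string
            self.vocab = {}  # lex hash --> lexeme

        def lookup(self, string):
            if hash(string) not in self.vocab:
                self.vocab[hash(string)] = PyLexeme(string, self.view_funcs, self.bacov)
            return self.vocab[hash(string)]

        def unhash(self, hash_value):
            return self.bacov[hash_value]

    en = PyLanguage()
    w = en.lookup(u'Hello')
    assert en.unhash(w.string_views[SHAPE]) == u'Xxxxx'

The "struggle" the commit message mentions is visible in attr_of above: Cython fused types such as AttrType are specialized at compile time, so dispatching on a runtime AttrName value and returning differently-typed fields from one function is exactly the awkward part.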
""" - return EN.lookup(string) + return EN.lookup(string) cpdef unicode unhash(StringHash hash_value): @@ -142,3 +162,36 @@ cpdef unicode unhash(StringHash hash_value): string (unicode): A unicode string that hashes to the hash_value. """ return EN.unhash(hash_value) + + +def add_string_views(view_funcs): + """Add a string view to existing and previous lexical entries. + + Args: + get_view (function): A unicode --> unicode function. + + Returns: + view_id (int): An integer key you can use to access the view. + """ + pass + + +def load_clusters(location): + """Load cluster data. + """ + pass + +def load_unigram_probs(location): + """Load unigram probabilities. + """ + pass + +def load_case_stats(location): + """Load case stats. + """ + pass + +def load_tag_stats(location): + """Load tag statistics. + """ + pass diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index f6836af56..d4af5b0bc 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -1,6 +1,6 @@ from libc.stdint cimport uint32_t from libc.stdint cimport uint64_t - +cimport cython ctypedef int ClusterID ctypedef uint32_t StringHash diff --git a/spacy/orthography/latin.pxd b/spacy/orthography/latin.pxd index 9c3e853ec..24c035dc9 100644 --- a/spacy/orthography/latin.pxd +++ b/spacy/orthography/latin.pxd @@ -10,10 +10,9 @@ cdef enum OrthFlag: cdef enum: - LEX - LAST3 NORM SHAPE + LAST3 from spacy.lexeme cimport LexID from spacy.lexeme cimport StringHash diff --git a/spacy/orthography/latin.pyx b/spacy/orthography/latin.pyx index 328e73d8c..6e4434e5d 100644 --- a/spacy/orthography/latin.pyx +++ b/spacy/orthography/latin.pyx @@ -1,20 +1,38 @@ # cython: embedsignature=True +from __future__ import unicode_literals + from spacy.lexeme cimport Lexeme -def get_normalized(unicode lex): - if lex.isalpha() and lex.islower(): - return lex +def get_normalized(unicode word): + """Todo. + + Args: + word (unicode) + + Returns: + normalized (unicode) + """ + if word.isalpha() and word.islower(): + return word else: - return get_word_shape(lex) + return get_word_shape(word) -def get_word_shape(unicode lex): - cdef size_t length = len(lex) +def get_word_shape(unicode word): + """Todo. + + Args: + word (unicode) + + Returns: + shape (unicode) + """ + cdef size_t length = len(word) shape = "" last = "" shape_char = "" seq = 0 - for c in lex: + for c in word: if c.isalpha(): if c.isupper(): shape_char = "X" @@ -35,8 +53,14 @@ def get_word_shape(unicode lex): return shape +cpdef unicode get_last3(unicode string): + return string[-3:] + + cpdef bint is_alpha(LexID lex_id) except *: - """Give the result of unicode.isalpha() for a Lexeme ID. + """Check whether all characters in the word's string are alphabetic. + + Should match the :py:func:`unicode.isalpha()` function. >>> is_alpha(lookup(u'Hello')) True @@ -49,7 +73,9 @@ cpdef bint is_alpha(LexID lex_id) except *: cpdef bint is_digit(LexID lex_id) except *: - """Give the result of unicode.isdigit() for a Lexeme ID. + """Check whether all characters in the word's string are numeric. + + Should match the :py:func:`unicode.isdigit()` function. >>> is_digit(lookup(u'10')) True @@ -62,8 +88,8 @@ cpdef bint is_digit(LexID lex_id) except *: cpdef bint is_punct(LexID lex_id) except *: - """Give the result of checking whether all characters belong to a punctuation - unicode data category for a Lexeme ID. + """Check whether all characters belong to a punctuation unicode data category + for a Lexeme ID. 
@@ -78,11 +104,11 @@ cpdef bint is_space(LexID lex_id) except *:
     """Give the result of unicode.isspace() for a Lexeme ID.
 
-    >>> is_space(lookup(u'\t'))
+    >>> is_space(lookup(u'\\t'))
     True
     >>> is_space(lookup(u' '))
     True
-    >>> is_space(lookup(u'Hi\n'))
+    >>> is_space(lookup(u'Hi\\n'))
     False
     """
     return (<Lexeme*>lex_id).orth_flags & 1 << IS_SPACE
@@ -144,8 +170,8 @@ cpdef StringHash norm_of(LexID lex_id) except 0:
     """Return the hash of a "normalized" version of the string.
 
     Normalized strings are intended to be less sparse, while still capturing
-    important lexical information. See spacy.latin.orthography.normalize_string for details of the normalization
-    function.
+    important lexical information. See :py:func:`spacy.latin.orthography.normalize_string`
+    for details of the normalization function.
 
     >>> unhash(norm_of(lookup(u'Hi')))
     u'hi'
@@ -160,7 +186,7 @@ cpdef StringHash shape_of(LexID lex_id) except 0:
     """Return the hash of a string describing the word's "orthographic shape".
 
-    Orthographic shapes are calculated by the spacy.orthography.latin.string_shape
+    Orthographic shapes are calculated by the :py:func:`spacy.orthography.latin.string_shape`
     function. Word shape features have been found useful for NER and POS tagging,
     e.g. Manning (2011)
diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx
index addb76b39..7f54b1225 100644
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -24,6 +24,7 @@ TAGS = {}
 DIST_FLAGS = {}
 
 cdef class Language:
+    view_funcs = []
     def __cinit__(self, name):
         self.name = name
         self.bacov = {}
@@ -90,13 +91,41 @@ cdef class Language:
         cdef bytes byte_string = string.encode('utf8')
         word.string = byte_string
         word.length = len(byte_string)
-        self.set_orth(string, word)
-
         word.lex = hash(string)
+        word.string_views = <StringHash*>calloc(len(self.view_funcs), sizeof(StringHash))
+        cdef unicode view
+        cdef StringHash hashed
+        for i, view_func in enumerate(self.view_funcs):
+            view = view_func(string)
+            hashed = hash(view)
+            word.string_views[i] = hashed
+            self.bacov[hashed] = view
 
         self.bacov[word.lex] = string
         self.vocab[word.lex] = word
 
         return word
 
+    def add_view_funcs(self, list view_funcs):
+        self.view_funcs.extend(view_funcs)
+        cdef size_t nr_views = len(self.view_funcs)
+
+        cdef unicode view
+        cdef StringHash hashed
+        cdef StringHash key
+        cdef unicode string
+        cdef LexID lex_id
+        cdef Lexeme* word
+
+        for key, lex_id in self.vocab.items():
+            word = <Lexeme*>lex_id
+            free(word.string_views)
+            word.string_views = <StringHash*>calloc(nr_views, sizeof(StringHash))
+            string = word.string[:word.length].decode('utf8')
+            for i, view_func in enumerate(self.view_funcs):
+                view = view_func(string)
+                hashed = hash(view)
+                word.string_views[i] = hashed
+                self.bacov[hashed] = view
+
     cpdef unicode unhash(self, StringHash hash_value):
         '''Fetch a string from the reverse index, given its hash value.'''
         return self.bacov[hash_value]
diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index 9aaf08106..97cb7fb99 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -3,7 +3,6 @@ from cython.operator cimport preincrement as inc
 
 from spacy.lexeme cimport Lexeme
-#from spacy.lexeme cimport attr_of, lex_of, norm_of, shape_of
 
 from spacy.spacy cimport StringHash
 
@@ -66,8 +65,7 @@ cdef class Tokens:
         cdef StringHash key
         cdef Lexeme_addr t
         for t in self.vctr[0]:
-            #key = attr_of(t, attr)
-            key = 0
+            key = self.lang.attr_of(t, attr)
             if key in indices:
                 groups[indices[key]].append(t)
             else:
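The tokens.pyx change at the end is the consumer of all of this: Tokens.group_by asks the Language for an arbitrary attribute of each token and buckets the tokens by that value. In plain Python the intended logic is roughly the following sketch, where attr_of stands in for the bound Language.attr_of and can be any token --> hashable-key function::

    def group_by(tokens, attr_of):
        # Bucket tokens by an arbitrary attribute, preserving first-seen order.
        indices = {}  # attribute value -> position in groups
        groups = []
        for t in tokens:
            key = attr_of(t)
            if key not in indices:
                indices[key] = len(groups)
                groups.append([])
            groups[indices[key]].append(t)
        return groups

    assert group_by([u'Hi', u'Lo', u'pip'], len) == [[u'Hi', u'Lo'], [u'pip']]

Keeping indices (value -> group slot) separate from groups preserves insertion order without sorting, which matches the branch structure in the Cython loop above.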