From 80b36f9f2755715d8e9bf7abffd7d244124bb1b0 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 7 Jul 2014 19:12:19 +0200
Subject: [PATCH] * 710k words per second for counts

---
 spacy/lexeme.pxd |  1 +
 spacy/lexeme.pyx |  9 ++++++---
 spacy/spacy.pyx  | 25 ++++++++++++++++++++++++-
 spacy/tokens.pyx |  5 ++---
 4 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 783329981..041bdcc47 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -41,6 +41,7 @@ cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)
 
 cpdef StringHash lex_of(size_t lex_id) except 0
 cpdef StringHash norm_of(size_t lex_id) except 0
+cpdef StringHash shape_of(size_t lex_id) except 0
 
 #cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
 #                         int split, size_t length)
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 5d83cfeee..551b88442 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -54,6 +54,10 @@ cpdef StringHash norm_of(size_t lex_id) except 0:
     return (<Lexeme*>lex_id).orth.norm
 
 
+cpdef StringHash shape_of(size_t lex_id) except 0:
+    return (<Lexeme*>lex_id).orth.shape
+
+
 cpdef ClusterID cluster_of(size_t lex_id):
     '''Access the `cluster' field of the Lexeme pointed to by lex_id, which
     gives an integer representation of the cluster ID of the word,
@@ -94,7 +98,7 @@ cpdef double prob_of(size_t lex_id):
     >>> prob_of(lookup(u'world'))
     -20.10340371976182
     '''
-    pass
+    return (<Lexeme*>lex_id).dist.prob
 
 
 cpdef StringHash last3_of(size_t lex_id):
@@ -105,8 +109,7 @@ cpdef StringHash last3_of(size_t lex_id):
     >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
     [u'llo', u'!']
     '''
-    cdef Lexeme* w = <Lexeme*>lex_id
-    return w.orth.last3 if w.orth != NULL else 0
+    return (<Lexeme*>lex_id).orth.last3
 
 
 cpdef bint is_oft_upper(size_t lex_id):
diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx
index 4c4480ca1..b942e6f0f 100644
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -25,7 +25,30 @@ def get_normalized(unicode lex, size_t length):
 
 
 def get_word_shape(lex, length):
-    return lex
+    shape = ""
+    last = ""
+    shape_char = ""
+    seq = 0
+    for c in lex:
+        if c.isalpha():
+            if c.isupper():
+                shape_char = "X"
+            else:
+                shape_char = "x"
+        elif c.isdigit():
+            shape_char = "d"
+        else:
+            shape_char = c
+        if shape_char == last:
+            seq += 1
+        else:
+            seq = 0
+            last = shape_char
+        if seq < 3:
+            shape += shape_char
+    assert shape
+    return shape
+
 
 
 def set_orth_flags(lex, length):
diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index 61229e500..9d40ceb26 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -3,7 +3,7 @@ from cython.operator cimport preincrement as inc
 
 
 from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport norm_of
+from spacy.lexeme cimport norm_of, shape_of
 from spacy.spacy cimport StringHash
 
 
@@ -45,9 +45,8 @@ cdef class Tokens:
         cdef Lexeme_addr t
         cdef StringHash key
         for t in self.vctr[0]:
-            key = norm_of(t)
+            key = (<Lexeme*>t).lex
             if key not in counts:
                 counts[key] = 0
             counts[key] += 1
         return counts
-