From 8bbfadfced722dded923aa2478684737bd0ca86c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 30 Aug 2014 20:36:06 +0200
Subject: [PATCH] * Pass tests. Need to implement more feature functions.

---
 spacy/orth.py          |  3 +++
 spacy/word.pxd         |  2 +-
 spacy/word.pyx         |  1 +
 tests/test_vocab.py    | 31 ++++++++++++++-----------------
 tests/test_wiki_sun.py |  6 ++----
 5 files changed, 21 insertions(+), 22 deletions(-)

diff --git a/spacy/orth.py b/spacy/orth.py
index c574006c8..847d7eae7 100644
--- a/spacy/orth.py
+++ b/spacy/orth.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 # Binary string features
 def is_alpha(string, prob, case_stats, tag_stats):
     return False
@@ -41,6 +43,7 @@ def can_tag(name, thresh):
 def canon_case(string, prob, cluster, case_stats, tag_stats):
     return string
 
+
 def word_shape(string, *args):
     length = len(string)
     shape = ""
diff --git a/spacy/word.pxd b/spacy/word.pxd
index 90c9c941a..c382d91ea 100644
--- a/spacy/word.pxd
+++ b/spacy/word.pxd
@@ -11,7 +11,7 @@ cdef class Lexeme:
     cpdef readonly double prob
     cpdef readonly size_t cluster
 
-    cpdef readonly string
+    cpdef readonly unicode string
     cpdef readonly list views
 
     cdef readonly flag_t flags
diff --git a/spacy/word.pyx b/spacy/word.pyx
index 9427e3397..8824d8a89 100644
--- a/spacy/word.pyx
+++ b/spacy/word.pyx
@@ -54,6 +54,7 @@ cdef class Lexeme:
         self.string = string
 
         self.views = []
+        cdef unicode view
         for string_feature in string_features:
             view = string_feature(string, prob, cluster, case_stats, tag_stats)
             self.views.append(view)
diff --git a/tests/test_vocab.py b/tests/test_vocab.py
index 6128e728a..706a7ee07 100644
--- a/tests/test_vocab.py
+++ b/tests/test_vocab.py
@@ -1,37 +1,34 @@
 from __future__ import unicode_literals
 
-from spacy import lex_of
-from spacy.en import lookup
-from spacy.en import unhash
+from spacy.en import EN
 
 
 def test_neq():
-    addr = lookup('Hello')
-    assert lookup('bye') != addr
+    addr = EN.lookup('Hello')
+    assert EN.lookup('bye') != addr
 
 
 def test_eq():
-    addr = lookup('Hello')
-    assert lookup('Hello') == addr
+    addr = EN.lookup('Hello')
+    assert EN.lookup('Hello') == addr
 
 
 def test_round_trip():
-    hello = lookup('Hello')
-    assert unhash(hello.lex) == 'Hello'
+    hello = EN.lookup('Hello')
+    assert hello.string == 'Hello'
 
 
 def test_case_neq():
-    addr = lookup('Hello')
-    assert lookup('hello') != addr
+    addr = EN.lookup('Hello')
+    assert EN.lookup('hello') != addr
 
 
 def test_punct_neq():
-    addr = lookup('Hello')
-    assert lookup('Hello,') != addr
+    addr = EN.lookup('Hello')
+    assert EN.lookup('Hello,') != addr
 
 
 def test_short():
-    addr = lookup('I')
-    assert unhash(addr.lex) == 'I'
-    addr = lookup('not')
-    assert unhash(addr.lex) == 'not'
+    addr = EN.lookup('I')
+    assert addr.string == 'I'
+    assert addr.string != 'not'
diff --git a/tests/test_wiki_sun.py b/tests/test_wiki_sun.py
index 1329bdffc..75194b7f5 100644
--- a/tests/test_wiki_sun.py
+++ b/tests/test_wiki_sun.py
@@ -1,8 +1,6 @@
 from __future__ import unicode_literals
 
-from spacy.en import unhash
-from spacy import lex_of
-from spacy import en
+from spacy.en import EN
 from spacy.util import utf8open
 
 import pytest
@@ -21,5 +19,5 @@ def sun_txt():
 
 def test_tokenize(sun_txt):
     assert len(sun_txt) != 0
-    tokens = en.tokenize(sun_txt)
+    tokens = EN.tokenize(sun_txt)
     assert True
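
The commit message notes that more feature functions remain to be implemented, and the orth.py hunk above shows only the opening lines of word_shape(). For orientation only, below is a minimal sketch of how a shape feature matching the word_shape(string, *args) signature could be completed; the character classes and the run-capping rule are illustrative assumptions, not taken from this patch.

# Illustrative sketch only (not part of the patch): one plausible way to
# finish word_shape(). Letters map to X/x, digits to d, other characters
# pass through unchanged, and runs of the same class are capped so that
# long words still share a shape with shorter ones.
def word_shape(string, *args):
    shape = ""
    last_class = ""
    run_length = 0
    for c in string:
        if c.isalpha():
            char_class = "X" if c.isupper() else "x"
        elif c.isdigit():
            char_class = "d"
        else:
            char_class = c
        if char_class == last_class:
            run_length += 1
        else:
            run_length = 0
            last_class = char_class
        # Keep at most three consecutive characters of the same class.
        if run_length < 3:
            shape += char_class
    return shape

# e.g. word_shape("Hello,") == "Xxxx,"  and  word_shape("1999") == "ddd"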