From 7c09c73a14e55f9eedda0d7664deeabe53474b0e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 10 Sep 2014 18:27:44 +0200
Subject: [PATCH] * Refactor to use tokens class.

---
 spacy/en.pyx          | 5 ++++-
 spacy/lang.pyx        | 6 +++---
 spacy/orth.py         | 4 ++--
 tests/test_asciify.py | 8 ++++----
 tests/test_orth.py    | 6 ++----
 5 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/spacy/en.pyx b/spacy/en.pyx
index 497c9e350..62e195ca8 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -218,6 +218,9 @@ cdef class English(Language):
         name (unicode): The two letter code used by Wikipedia for the language.
         lexicon (Lexicon): The lexicon. Exposes the lookup method.
     """
+    fl_is_alpha = Flag_IsAlpha
+    fl_is_digit = Flag_IsDigit
+    v_shape = View_WordShape
     def __cinit__(self, name, user_string_features, user_flag_features):
         self.cache = {}
         lang_data = util.read_lang_data(name)
@@ -226,7 +229,7 @@ cdef class English(Language):
                                STRING_VIEW_FUNCS + user_string_features,
                                FLAG_FUNCS + user_flag_features)
         self._load_special_tokenization(rules)
-        self.token_class = EnglishTokens
+        self.tokens_class = EnglishTokens
 
     cdef int _split_one(self, unicode word):
         cdef size_t length = len(word)
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 3d02b7677..b3d6dcd0e 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -43,7 +43,7 @@ cdef class Language:
         self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
                                string_features, flag_features)
         self._load_special_tokenization(rules)
-        self.token_class = Tokens
+        self.tokens_class = Tokens
 
     property nr_types:
         def __get__(self):
@@ -81,7 +81,7 @@ cdef class Language:
         cdef size_t length = len(string)
         cdef size_t start = 0
         cdef size_t i = 0
-        cdef Tokens tokens = self.token_class()
+        cdef Tokens tokens = self.tokens_class()
         for c in string:
             if c == ' ':
                 if start < i:
@@ -91,7 +91,7 @@ cdef class Language:
             if start < i:
                 self._tokenize(tokens, string[start:i])
         assert tokens
-        return tokens
+        return tokens.lexemes
 
     cdef _tokenize(self, Tokens tokens, unicode string):
         cdef list lexemes
diff --git a/spacy/orth.py b/spacy/orth.py
index b7106d609..685de191c 100644
--- a/spacy/orth.py
+++ b/spacy/orth.py
@@ -56,7 +56,7 @@ def oft_case(name, thresh):
     return wrapped
 
 
-def can_tag(name, thresh):
+def can_tag(name, thresh=0.5):
     def wrapped(string, prob, case_stats, tag_stats):
         return string
     return wrapped
@@ -111,7 +111,7 @@ def non_sparse(string, prob, cluster, case_stats, tag_stats):
         return word_shape(string, prob, cluster, case_stats, tag_stats)
 
 
-def asciied(string):
+def asciied(string, prob=0, cluster=0, case_stats=None, tag_stats=None):
     '''"ASCIIfy" a Unicode string by stripping all umlauts, tildes, etc.'''
     # Snippet from
     # http://www.physic.ut.ee/~kkannike/english/prog/python/util/asciify/index.html
diff --git a/tests/test_asciify.py b/tests/test_asciify.py
index eed71a5f3..d03af0d25 100644
--- a/tests/test_asciify.py
+++ b/tests/test_asciify.py
@@ -3,16 +3,16 @@ from __future__ import unicode_literals
 
 import pytest
 
-from spacy.orth import asciify
+from spacy.orth import asciied
 
 
 def test_tilde():
     string = u'hõmbre'
-    assert asciify(string) == u'hombre'
+    assert asciied(string) == u'hombre'
 
 
 def test_smart_quote():
     string = u'“'
-    assert asciify(string) == '"'
+    assert asciied(string) == '"'
     string = u'”'
-    assert asciify(string) == '"'
+    assert asciied(string) == '"'
diff --git a/tests/test_orth.py b/tests/test_orth.py
index 33cd4014a..0840af683 100644
--- a/tests/test_orth.py
+++ b/tests/test_orth.py
@@ -3,9 +3,7 @@ from __future__ import unicode_literals
 
 import pytest
 import spacy.word
-from spacy import en
-
-EN = en.EN
+from spacy.en import EN
 
 
 @pytest.fixture
@@ -14,7 +12,7 @@ def C3P0():
 
 
 def test_shape(C3P0):
-    assert C3P0.string_view(en.SHAPE) == "XdXd"
+    assert C3P0.string_view(EN.v_shape) == "XdXd"
 
 
 def test_length():
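--
A minimal usage sketch (illustrative only, not part of the patch) of how the
renamed API reads after this refactor. EN is the module-level English instance
the tests import from spacy.en; tokenize(), string_view(), and the v_shape
class attribute all appear in the hunks above, while lookup() is an assumption,
since the C3P0 fixture body is not shown in the diff.

    # Sketch of the refactored tokens_class API (Python 2, as in the tests).
    from __future__ import unicode_literals
    from spacy.en import EN

    # Language.tokenize() now returns tokens.lexemes instead of the Tokens object.
    lexemes = EN.tokenize('See C3P0 run.')

    # View constants now hang off the English class, so callers write
    # EN.v_shape rather than importing a module-level en.SHAPE constant.
    word = EN.lookup('C3P0')                       # assumed lookup API
    assert word.string_view(EN.v_shape) == 'XdXd'  # letter/digit word shape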