Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-12 10:16:27 +03:00)
* Refactor to use tokens class.
parent cf412adba8
commit 7c09c73a14
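Taken together, the hunks below do two things: they rename the Language.token_class attribute to Language.tokens_class (and likewise on the English subclass), and they change tokenize() to return tokens.lexemes rather than the Tokens object itself. A minimal sketch of the resulting call pattern, assuming the module-level EN instance that the updated tests import from spacy.en:

    from spacy.en import EN

    # tokenize() builds an instance of EN.tokens_class internally and,
    # after this commit, hands back its .lexemes list.
    lexemes = EN.tokenize(u'Hello world')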
@@ -218,6 +218,9 @@ cdef class English(Language):
         name (unicode): The two letter code used by Wikipedia for the language.
         lexicon (Lexicon): The lexicon. Exposes the lookup method.
     """
+    fl_is_alpha = Flag_IsAlpha
+    fl_is_digit = Flag_IsDigit
+    v_shape = View_WordShape
     def __cinit__(self, name, user_string_features, user_flag_features):
         self.cache = {}
         lang_data = util.read_lang_data(name)
@@ -226,7 +229,7 @@ cdef class English(Language):
                                STRING_VIEW_FUNCS + user_string_features,
                                FLAG_FUNCS + user_flag_features)
         self._load_special_tokenization(rules)
-        self.token_class = EnglishTokens
+        self.tokens_class = EnglishTokens
 
     cdef int _split_one(self, unicode word):
         cdef size_t length = len(word)
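The three new class attributes re-export the flag and view constants on English itself, so callers can write EN.v_shape instead of importing module-level IDs (the updated test_shape at the bottom of this diff does exactly that). A hedged sketch, assuming the lexicon's lookup method mentioned in the docstring above returns a word object exposing string_view:

    word = EN.lexicon.lookup(u'C3P0')
    assert word.string_view(EN.v_shape) == u'XdXd'
    # Flag features would be queried analogously through fl_is_alpha and
    # fl_is_digit (the flag-checking method is assumed, not shown in this diff).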
@@ -43,7 +43,7 @@ cdef class Language:
         self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
                                string_features, flag_features)
         self._load_special_tokenization(rules)
-        self.token_class = Tokens
+        self.tokens_class = Tokens
 
     property nr_types:
         def __get__(self):
@@ -81,7 +81,7 @@ cdef class Language:
         cdef size_t length = len(string)
         cdef size_t start = 0
         cdef size_t i = 0
-        cdef Tokens tokens = self.token_class()
+        cdef Tokens tokens = self.tokens_class()
         for c in string:
             if c == ' ':
                 if start < i:
@@ -91,7 +91,7 @@ cdef class Language:
         if start < i:
             self._tokenize(tokens, string[start:i])
         assert tokens
-        return tokens
+        return tokens.lexemes
 
     cdef _tokenize(self, Tokens tokens, unicode string):
         cdef list lexemes
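For orientation, tokenize() scans the string for spaces, hands each non-space span to _tokenize(), and now returns the accumulated lexemes. A rough pure-Python sketch of that loop; the start-pointer bookkeeping between the two hunks is not shown in this diff, so those lines are an assumption:

    def tokenize_sketch(string, tokens_class, _tokenize):
        tokens = tokens_class()               # was: self.token_class()
        start = 0
        for i, c in enumerate(string):
            if c == ' ':
                if start < i:
                    _tokenize(tokens, string[start:i])
                start = i + 1                 # assumed: skip past the space
        if start < len(string):               # trailing span after the last space
            _tokenize(tokens, string[start:])
        return tokens.lexemes                 # was: return tokens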
@@ -56,7 +56,7 @@ def oft_case(name, thresh):
     return wrapped
 
 
-def can_tag(name, thresh):
+def can_tag(name, thresh=0.5):
     def wrapped(string, prob, case_stats, tag_stats):
         return string
     return wrapped
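Like oft_case directly above it, can_tag is a feature-function factory: the outer call fixes a tag name and a probability threshold, and the returned closure exposes the uniform (string, prob, case_stats, tag_stats) signature. With the new default, callers can omit the threshold; the tag names here are illustrative:

    can_noun = can_tag('NN')                  # thresh now defaults to 0.5
    can_verb = can_tag('VB', thresh=0.8)      # explicit threshold still works
    # each closure is later invoked as feature(string, prob, case_stats, tag_stats)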
@@ -111,7 +111,7 @@ def non_sparse(string, prob, cluster, case_stats, tag_stats):
     return word_shape(string, prob, cluster, case_stats, tag_stats)
 
 
-def asciied(string):
+def asciied(string, prob=0, cluster=0, case_stats=None, tag_stats=None):
     '''"ASCIIfy" a Unicode string by stripping all umlauts, tildes, etc.'''
     # Snippet from
     # http://www.physic.ut.ee/~kkannike/english/prog/python/util/asciify/index.html
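Giving asciied the same five-parameter shape as non_sparse above means it can be registered alongside the other string-view functions and invoked with the full argument tuple, while plain one-argument calls keep working:

    from spacy.orth import asciied

    asciied(u'hõmbre')                        # direct call; extra args defaulted
    asciied(u'hõmbre', 0.0, 0, None, None)    # view-function-style call, same result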
@@ -3,16 +3,16 @@
 from __future__ import unicode_literals
 import pytest
 
-from spacy.orth import asciify
+from spacy.orth import asciied
 
 
 def test_tilde():
     string = u'hõmbre'
-    assert asciify(string) == u'hombre'
+    assert asciied(string) == u'hombre'
 
 
 def test_smart_quote():
     string = u'“'
-    assert asciify(string) == '"'
+    assert asciied(string) == '"'
     string = u'”'
-    assert asciify(string) == '"'
+    assert asciied(string) == '"'
@@ -3,9 +3,7 @@ from __future__ import unicode_literals
 import pytest
 
 import spacy.word
-from spacy import en
-
-EN = en.EN
+from spacy.en import EN
 
 
 @pytest.fixture
@@ -14,7 +12,7 @@ def C3P0():
 
 
 def test_shape(C3P0):
-    assert C3P0.string_view(en.SHAPE) == "XdXd"
+    assert C3P0.string_view(EN.v_shape) == "XdXd"
 
 
 def test_length():
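The rewritten assertion reads the shape view through the renamed class attribute rather than a module-level constant. The word-shape transform maps uppercase letters to 'X', lowercase to 'x', and digits to 'd', hence 'C3P0' -> 'XdXd'. A standalone approximation (the real view comes from word_shape in spacy.orth, which may also collapse long character runs):

    def word_shape_sketch(string):
        shape = []
        for c in string:
            if c.isdigit():
                shape.append('d')
            elif c.isupper():
                shape.append('X')
            elif c.islower():
                shape.append('x')
            else:
                shape.append(c)
        return ''.join(shape)

    assert word_shape_sketch(u'C3P0') == u'XdXd'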