* Begin testing more functionality

2025-11-08 03:47:39 +03:00 · 2014-08-30 19:01:15 +02:00 · 2014-08-30 19:01:15 +02:00 · dcab14ede2
commit dcab14ede2
parent 3e3ff99ca0
5 changed files with 36 additions and 32 deletions
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -42,6 +42,8 @@ from libc.stdint cimport uint64_t

 cimport lang

+from spacy import util
+
 from spacy import orth

 TAG_THRESH = 0.5
@@ -78,6 +80,11 @@ CAN_POS = NR_FLAGS; NR_FLAGS += 1
 CAN_PRON = NR_FLAGS; NR_FLAGS += 1
 CAN_PRT = NR_FLAGS; NR_FLAGS += 1

+NR_VIEWS = 0
+CANON_CASED = NR_VIEWS; NR_VIEWS += 1
+SHAPE = NR_VIEWS; NR_VIEWS += 1
+NON_SPARSE = NR_VIEWS; NR_VIEWS += 1
+

 cdef class English(Language):
    """English tokenizer, tightly coupled to lexicon.
@@ -87,8 +94,8 @@ cdef class English(Language):
        lexicon (Lexicon): The lexicon. Exposes the lookup method.
    """

-    def __cinit__(self, name):
-        flag_funcs = [0 for _ in range(NR_FLAGS)]
+    def __cinit__(self, name, string_features, flag_features):
+        flag_funcs = [None for _ in range(NR_FLAGS)]
        
        flag_funcs[OFT_UPPER] = orth.oft_case('upper', UPPER_THRESH)
        flag_funcs[OFT_LOWER] = orth.oft_case('lower', LOWER_THRESH)
@@ -98,6 +105,7 @@ cdef class English(Language):
        flag_funcs[IS_DIGIT] = orth.is_digit
        flag_funcs[IS_PUNCT] = orth.is_punct
        flag_funcs[IS_SPACE] = orth.is_space
+        flag_funcs[IS_ASCII] = orth.is_ascii
        flag_funcs[IS_TITLE] = orth.is_title
        flag_funcs[IS_LOWER] = orth.is_lower
        flag_funcs[IS_UPPER] = orth.is_upper
@@ -108,13 +116,25 @@ cdef class English(Language):
        flag_funcs[CAN_DET] = orth.can_tag('DET', TAG_THRESH)
        flag_funcs[CAN_ADP] = orth.can_tag('ADP', TAG_THRESH)
        flag_funcs[CAN_ADJ] = orth.can_tag('ADJ', TAG_THRESH)
+        flag_funcs[CAN_ADV] = orth.can_tag('ADV', TAG_THRESH)
        flag_funcs[CAN_VERB] = orth.can_tag('VERB', TAG_THRESH)
        flag_funcs[CAN_NOUN] = orth.can_tag('NOUN', TAG_THRESH)
        flag_funcs[CAN_PDT] = orth.can_tag('PDT', TAG_THRESH)
        flag_funcs[CAN_POS] = orth.can_tag('POS', TAG_THRESH)
+        flag_funcs[CAN_PRON] = orth.can_tag('PRON', TAG_THRESH)
        flag_funcs[CAN_PRT] = orth.can_tag('PRT', TAG_THRESH)
-        
-        Language.__init__(self, name, flag_funcs)
+
+        string_funcs = [None for _ in range(NR_VIEWS)]
+        string_funcs[CANON_CASED] = orth.canon_case
+        string_funcs[SHAPE] = orth.word_shape
+        string_funcs[NON_SPARSE] = orth.non_sparse
+        self.name = name
+        self.cache = {}
+        lang_data = util.read_lang_data(name)
+        rules, words, probs, clusters, case_stats, tag_stats = lang_data
+        self.lexicon = lang.Lexicon(words, probs, clusters, case_stats, tag_stats,
+                                     string_funcs, flag_funcs)
+        self._load_special_tokenization(rules)

    cdef int _split_one(self, unicode word):
        cdef size_t length = len(word)
@@ -149,4 +169,4 @@ cdef bint _check_punct(unicode word, size_t i, size_t length):
    return not word[i].isalnum()


-EN = English('en')
+EN = English('en', [], [])
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -30,7 +30,7 @@ cdef class Language:

    The language's name is used to look up default data-files, found in data/<name.
    """
-    def __cinit__(self, name, string_features=None, flag_features=None):
+    def __cinit__(self, name, string_features, flag_features):
        if flag_features is None:
            flag_features = []
        if string_features is None:
--- a/spacy/word.pyx
+++ b/spacy/word.pyx
@@ -53,6 +53,7 @@ cdef class Lexeme:
        self.length = len(string)
        self.string = string

+        self.views = []
        for string_feature in string_features:
            view = string_feature(string, prob, cluster, case_stats, tag_stats)
            self.views.append(view)
--- a/tests/test_hash_table.py
+++ b/tests/test_hash_table.py
@@ -1,18 +0,0 @@
-import pytest
-
-from spacy._hashing import FixedTable
-
-
-def test_insert():
-    table = FixedTable(20)
-    table[5] = 10
-    assert table.bucket(5) == 5
-    assert table[4] == 0
-    assert table[5] == 10
-
-def test_clobber():
-    table = FixedTable(10)
-    table[9] = 1
-    assert table.bucket(9) == 9
-    assert table.bucket(19) == 9
-
--- a/tests/test_orth.py
+++ b/tests/test_orth.py
@@ -2,26 +2,27 @@ from __future__ import unicode_literals

 import pytest

-from spacy.en import lookup, unhash
 import spacy.word
+from spacy import en
+
+EN = en.EN


@pytest.fixture
 def C3P0():
-    return lookup("C3P0")
+    return EN.lookup("C3P0")


 def test_shape(C3P0):
-    # TODO: Fix this
-    assert unhash(C3P0.get_view(2)) == "XdXd"
+    assert C3P0.string_view(en.SHAPE) == "XdXd"


 def test_length():
-    t = lookup('the')
+    t = EN.lookup('the')
    assert t.length == 3
-    t = lookup("n't")
+    t = EN.lookup("n't")
    assert t.length == 3
-    t = lookup("'s")
+    t = EN.lookup("'s")
    assert t.length == 2
-    t = lookup('Xxxx')
+    t = EN.lookup('Xxxx')
    assert t.length == 4