* Begin testing more functionality

This commit is contained in:
Matthew Honnibal 2014-08-30 19:01:15 +02:00
parent 3e3ff99ca0
commit dcab14ede2
5 changed files with 36 additions and 32 deletions

View File

@ -42,6 +42,8 @@ from libc.stdint cimport uint64_t
cimport lang
from spacy import util
from spacy import orth
TAG_THRESH = 0.5
@ -78,6 +80,11 @@ CAN_POS = NR_FLAGS; NR_FLAGS += 1
CAN_PRON = NR_FLAGS; NR_FLAGS += 1
CAN_PRT = NR_FLAGS; NR_FLAGS += 1
NR_VIEWS = 0
CANON_CASED = NR_VIEWS; NR_VIEWS += 1
SHAPE = NR_VIEWS; NR_VIEWS += 1
NON_SPARSE = NR_VIEWS; NR_VIEWS += 1
cdef class English(Language):
"""English tokenizer, tightly coupled to lexicon.
@ -87,8 +94,8 @@ cdef class English(Language):
lexicon (Lexicon): The lexicon. Exposes the lookup method.
"""
def __cinit__(self, name):
flag_funcs = [0 for _ in range(NR_FLAGS)]
def __cinit__(self, name, string_features, flag_features):
flag_funcs = [None for _ in range(NR_FLAGS)]
flag_funcs[OFT_UPPER] = orth.oft_case('upper', UPPER_THRESH)
flag_funcs[OFT_LOWER] = orth.oft_case('lower', LOWER_THRESH)
@ -98,6 +105,7 @@ cdef class English(Language):
flag_funcs[IS_DIGIT] = orth.is_digit
flag_funcs[IS_PUNCT] = orth.is_punct
flag_funcs[IS_SPACE] = orth.is_space
flag_funcs[IS_ASCII] = orth.is_ascii
flag_funcs[IS_TITLE] = orth.is_title
flag_funcs[IS_LOWER] = orth.is_lower
flag_funcs[IS_UPPER] = orth.is_upper
@ -108,13 +116,25 @@ cdef class English(Language):
flag_funcs[CAN_DET] = orth.can_tag('DET', TAG_THRESH)
flag_funcs[CAN_ADP] = orth.can_tag('ADP', TAG_THRESH)
flag_funcs[CAN_ADJ] = orth.can_tag('ADJ', TAG_THRESH)
flag_funcs[CAN_ADV] = orth.can_tag('ADV', TAG_THRESH)
flag_funcs[CAN_VERB] = orth.can_tag('VERB', TAG_THRESH)
flag_funcs[CAN_NOUN] = orth.can_tag('NOUN', TAG_THRESH)
flag_funcs[CAN_PDT] = orth.can_tag('PDT', TAG_THRESH)
flag_funcs[CAN_POS] = orth.can_tag('POS', TAG_THRESH)
flag_funcs[CAN_PRON] = orth.can_tag('PRON', TAG_THRESH)
flag_funcs[CAN_PRT] = orth.can_tag('PRT', TAG_THRESH)
Language.__init__(self, name, flag_funcs)
string_funcs = [None for _ in range(NR_VIEWS)]
string_funcs[CANON_CASED] = orth.canon_case
string_funcs[SHAPE] = orth.word_shape
string_funcs[NON_SPARSE] = orth.non_sparse
self.name = name
self.cache = {}
lang_data = util.read_lang_data(name)
rules, words, probs, clusters, case_stats, tag_stats = lang_data
self.lexicon = lang.Lexicon(words, probs, clusters, case_stats, tag_stats,
string_funcs, flag_funcs)
self._load_special_tokenization(rules)
cdef int _split_one(self, unicode word):
cdef size_t length = len(word)
@ -149,4 +169,4 @@ cdef bint _check_punct(unicode word, size_t i, size_t length):
return not word[i].isalnum()
EN = English('en')
EN = English('en', [], [])

View File

@ -30,7 +30,7 @@ cdef class Language:
The language's name is used to look up default data-files, found in data/<name>.
"""
def __cinit__(self, name, string_features=None, flag_features=None):
def __cinit__(self, name, string_features, flag_features):
if flag_features is None:
flag_features = []
if string_features is None:

View File

@ -53,6 +53,7 @@ cdef class Lexeme:
self.length = len(string)
self.string = string
self.views = []
for string_feature in string_features:
view = string_feature(string, prob, cluster, case_stats, tag_stats)
self.views.append(view)

View File

@ -1,18 +0,0 @@
import pytest
from spacy._hashing import FixedTable
def test_insert():
table = FixedTable(20)
table[5] = 10
assert table.bucket(5) == 5
assert table[4] == 0
assert table[5] == 10
def test_clobber():
table = FixedTable(10)
table[9] = 1
assert table.bucket(9) == 9
assert table.bucket(19) == 9

View File

@ -2,26 +2,27 @@ from __future__ import unicode_literals
import pytest
from spacy.en import lookup, unhash
import spacy.word
from spacy import en
EN = en.EN
@pytest.fixture
def C3P0():
return lookup("C3P0")
return EN.lookup("C3P0")
def test_shape(C3P0):
# TODO: Fix this
assert unhash(C3P0.get_view(2)) == "XdXd"
assert C3P0.string_view(en.SHAPE) == "XdXd"
def test_length():
t = lookup('the')
t = EN.lookup('the')
assert t.length == 3
t = lookup("n't")
t = EN.lookup("n't")
assert t.length == 3
t = lookup("'s")
t = EN.lookup("'s")
assert t.length == 2
t = lookup('Xxxx')
t = EN.lookup('Xxxx')
assert t.length == 4