mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
* Begin testing more functionality
This commit is contained in:
parent
3e3ff99ca0
commit
dcab14ede2
30
spacy/en.pyx
30
spacy/en.pyx
|
@ -42,6 +42,8 @@ from libc.stdint cimport uint64_t
|
|||
|
||||
cimport lang
|
||||
|
||||
from spacy import util
|
||||
|
||||
from spacy import orth
|
||||
|
||||
TAG_THRESH = 0.5
|
||||
|
@ -78,6 +80,11 @@ CAN_POS = NR_FLAGS; NR_FLAGS += 1
|
|||
CAN_PRON = NR_FLAGS; NR_FLAGS += 1
|
||||
CAN_PRT = NR_FLAGS; NR_FLAGS += 1
|
||||
|
||||
NR_VIEWS = 0
|
||||
CANON_CASED = NR_VIEWS; NR_VIEWS += 1
|
||||
SHAPE = NR_VIEWS; NR_VIEWS += 1
|
||||
NON_SPARSE = NR_VIEWS; NR_VIEWS += 1
|
||||
|
||||
|
||||
cdef class English(Language):
|
||||
"""English tokenizer, tightly coupled to lexicon.
|
||||
|
@ -87,8 +94,8 @@ cdef class English(Language):
|
|||
lexicon (Lexicon): The lexicon. Exposes the lookup method.
|
||||
"""
|
||||
|
||||
def __cinit__(self, name):
|
||||
flag_funcs = [0 for _ in range(NR_FLAGS)]
|
||||
def __cinit__(self, name, string_features, flag_features):
|
||||
flag_funcs = [None for _ in range(NR_FLAGS)]
|
||||
|
||||
flag_funcs[OFT_UPPER] = orth.oft_case('upper', UPPER_THRESH)
|
||||
flag_funcs[OFT_LOWER] = orth.oft_case('lower', LOWER_THRESH)
|
||||
|
@ -98,6 +105,7 @@ cdef class English(Language):
|
|||
flag_funcs[IS_DIGIT] = orth.is_digit
|
||||
flag_funcs[IS_PUNCT] = orth.is_punct
|
||||
flag_funcs[IS_SPACE] = orth.is_space
|
||||
flag_funcs[IS_ASCII] = orth.is_ascii
|
||||
flag_funcs[IS_TITLE] = orth.is_title
|
||||
flag_funcs[IS_LOWER] = orth.is_lower
|
||||
flag_funcs[IS_UPPER] = orth.is_upper
|
||||
|
@ -108,13 +116,25 @@ cdef class English(Language):
|
|||
flag_funcs[CAN_DET] = orth.can_tag('DET', TAG_THRESH)
|
||||
flag_funcs[CAN_ADP] = orth.can_tag('ADP', TAG_THRESH)
|
||||
flag_funcs[CAN_ADJ] = orth.can_tag('ADJ', TAG_THRESH)
|
||||
flag_funcs[CAN_ADV] = orth.can_tag('ADV', TAG_THRESH)
|
||||
flag_funcs[CAN_VERB] = orth.can_tag('VERB', TAG_THRESH)
|
||||
flag_funcs[CAN_NOUN] = orth.can_tag('NOUN', TAG_THRESH)
|
||||
flag_funcs[CAN_PDT] = orth.can_tag('PDT', TAG_THRESH)
|
||||
flag_funcs[CAN_POS] = orth.can_tag('POS', TAG_THRESH)
|
||||
flag_funcs[CAN_PRON] = orth.can_tag('PRON', TAG_THRESH)
|
||||
flag_funcs[CAN_PRT] = orth.can_tag('PRT', TAG_THRESH)
|
||||
|
||||
Language.__init__(self, name, flag_funcs)
|
||||
|
||||
string_funcs = [None for _ in range(NR_VIEWS)]
|
||||
string_funcs[CANON_CASED] = orth.canon_case
|
||||
string_funcs[SHAPE] = orth.word_shape
|
||||
string_funcs[NON_SPARSE] = orth.non_sparse
|
||||
self.name = name
|
||||
self.cache = {}
|
||||
lang_data = util.read_lang_data(name)
|
||||
rules, words, probs, clusters, case_stats, tag_stats = lang_data
|
||||
self.lexicon = lang.Lexicon(words, probs, clusters, case_stats, tag_stats,
|
||||
string_funcs, flag_funcs)
|
||||
self._load_special_tokenization(rules)
|
||||
|
||||
cdef int _split_one(self, unicode word):
|
||||
cdef size_t length = len(word)
|
||||
|
@ -149,4 +169,4 @@ cdef bint _check_punct(unicode word, size_t i, size_t length):
|
|||
return not word[i].isalnum()
|
||||
|
||||
|
||||
EN = English('en')
|
||||
EN = English('en', [], [])
|
||||
|
|
|
@ -30,7 +30,7 @@ cdef class Language:
|
|||
|
||||
The language's name is used to look up default data-files, found in data/<name.
|
||||
"""
|
||||
def __cinit__(self, name, string_features=None, flag_features=None):
|
||||
def __cinit__(self, name, string_features, flag_features):
|
||||
if flag_features is None:
|
||||
flag_features = []
|
||||
if string_features is None:
|
||||
|
|
|
@ -53,6 +53,7 @@ cdef class Lexeme:
|
|||
self.length = len(string)
|
||||
self.string = string
|
||||
|
||||
self.views = []
|
||||
for string_feature in string_features:
|
||||
view = string_feature(string, prob, cluster, case_stats, tag_stats)
|
||||
self.views.append(view)
|
||||
|
|
|
@ -1,18 +0,0 @@
|
|||
import pytest
|
||||
|
||||
from spacy._hashing import FixedTable
|
||||
|
||||
|
||||
def test_insert():
|
||||
table = FixedTable(20)
|
||||
table[5] = 10
|
||||
assert table.bucket(5) == 5
|
||||
assert table[4] == 0
|
||||
assert table[5] == 10
|
||||
|
||||
def test_clobber():
|
||||
table = FixedTable(10)
|
||||
table[9] = 1
|
||||
assert table.bucket(9) == 9
|
||||
assert table.bucket(19) == 9
|
||||
|
|
@ -2,26 +2,27 @@ from __future__ import unicode_literals
|
|||
|
||||
import pytest
|
||||
|
||||
from spacy.en import lookup, unhash
|
||||
import spacy.word
|
||||
from spacy import en
|
||||
|
||||
EN = en.EN
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def C3P0():
|
||||
return lookup("C3P0")
|
||||
return EN.lookup("C3P0")
|
||||
|
||||
|
||||
def test_shape(C3P0):
|
||||
# TODO: Fix this
|
||||
assert unhash(C3P0.get_view(2)) == "XdXd"
|
||||
assert C3P0.string_view(en.SHAPE) == "XdXd"
|
||||
|
||||
|
||||
def test_length():
|
||||
t = lookup('the')
|
||||
t = EN.lookup('the')
|
||||
assert t.length == 3
|
||||
t = lookup("n't")
|
||||
t = EN.lookup("n't")
|
||||
assert t.length == 3
|
||||
t = lookup("'s")
|
||||
t = EN.lookup("'s")
|
||||
assert t.length == 2
|
||||
t = lookup('Xxxx')
|
||||
t = EN.lookup('Xxxx')
|
||||
assert t.length == 4
|
||||
|
|
Loading…
Reference in New Issue
Block a user