mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
* Begin testing more functionality
This commit is contained in:
parent
3e3ff99ca0
commit
dcab14ede2
30
spacy/en.pyx
30
spacy/en.pyx
|
@ -42,6 +42,8 @@ from libc.stdint cimport uint64_t
|
||||||
|
|
||||||
cimport lang
|
cimport lang
|
||||||
|
|
||||||
|
from spacy import util
|
||||||
|
|
||||||
from spacy import orth
|
from spacy import orth
|
||||||
|
|
||||||
TAG_THRESH = 0.5
|
TAG_THRESH = 0.5
|
||||||
|
@ -78,6 +80,11 @@ CAN_POS = NR_FLAGS; NR_FLAGS += 1
|
||||||
CAN_PRON = NR_FLAGS; NR_FLAGS += 1
|
CAN_PRON = NR_FLAGS; NR_FLAGS += 1
|
||||||
CAN_PRT = NR_FLAGS; NR_FLAGS += 1
|
CAN_PRT = NR_FLAGS; NR_FLAGS += 1
|
||||||
|
|
||||||
|
NR_VIEWS = 0
|
||||||
|
CANON_CASED = NR_VIEWS; NR_VIEWS += 1
|
||||||
|
SHAPE = NR_VIEWS; NR_VIEWS += 1
|
||||||
|
NON_SPARSE = NR_VIEWS; NR_VIEWS += 1
|
||||||
|
|
||||||
|
|
||||||
cdef class English(Language):
|
cdef class English(Language):
|
||||||
"""English tokenizer, tightly coupled to lexicon.
|
"""English tokenizer, tightly coupled to lexicon.
|
||||||
|
@ -87,8 +94,8 @@ cdef class English(Language):
|
||||||
lexicon (Lexicon): The lexicon. Exposes the lookup method.
|
lexicon (Lexicon): The lexicon. Exposes the lookup method.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __cinit__(self, name):
|
def __cinit__(self, name, string_features, flag_features):
|
||||||
flag_funcs = [0 for _ in range(NR_FLAGS)]
|
flag_funcs = [None for _ in range(NR_FLAGS)]
|
||||||
|
|
||||||
flag_funcs[OFT_UPPER] = orth.oft_case('upper', UPPER_THRESH)
|
flag_funcs[OFT_UPPER] = orth.oft_case('upper', UPPER_THRESH)
|
||||||
flag_funcs[OFT_LOWER] = orth.oft_case('lower', LOWER_THRESH)
|
flag_funcs[OFT_LOWER] = orth.oft_case('lower', LOWER_THRESH)
|
||||||
|
@ -98,6 +105,7 @@ cdef class English(Language):
|
||||||
flag_funcs[IS_DIGIT] = orth.is_digit
|
flag_funcs[IS_DIGIT] = orth.is_digit
|
||||||
flag_funcs[IS_PUNCT] = orth.is_punct
|
flag_funcs[IS_PUNCT] = orth.is_punct
|
||||||
flag_funcs[IS_SPACE] = orth.is_space
|
flag_funcs[IS_SPACE] = orth.is_space
|
||||||
|
flag_funcs[IS_ASCII] = orth.is_ascii
|
||||||
flag_funcs[IS_TITLE] = orth.is_title
|
flag_funcs[IS_TITLE] = orth.is_title
|
||||||
flag_funcs[IS_LOWER] = orth.is_lower
|
flag_funcs[IS_LOWER] = orth.is_lower
|
||||||
flag_funcs[IS_UPPER] = orth.is_upper
|
flag_funcs[IS_UPPER] = orth.is_upper
|
||||||
|
@ -108,13 +116,25 @@ cdef class English(Language):
|
||||||
flag_funcs[CAN_DET] = orth.can_tag('DET', TAG_THRESH)
|
flag_funcs[CAN_DET] = orth.can_tag('DET', TAG_THRESH)
|
||||||
flag_funcs[CAN_ADP] = orth.can_tag('ADP', TAG_THRESH)
|
flag_funcs[CAN_ADP] = orth.can_tag('ADP', TAG_THRESH)
|
||||||
flag_funcs[CAN_ADJ] = orth.can_tag('ADJ', TAG_THRESH)
|
flag_funcs[CAN_ADJ] = orth.can_tag('ADJ', TAG_THRESH)
|
||||||
|
flag_funcs[CAN_ADV] = orth.can_tag('ADV', TAG_THRESH)
|
||||||
flag_funcs[CAN_VERB] = orth.can_tag('VERB', TAG_THRESH)
|
flag_funcs[CAN_VERB] = orth.can_tag('VERB', TAG_THRESH)
|
||||||
flag_funcs[CAN_NOUN] = orth.can_tag('NOUN', TAG_THRESH)
|
flag_funcs[CAN_NOUN] = orth.can_tag('NOUN', TAG_THRESH)
|
||||||
flag_funcs[CAN_PDT] = orth.can_tag('PDT', TAG_THRESH)
|
flag_funcs[CAN_PDT] = orth.can_tag('PDT', TAG_THRESH)
|
||||||
flag_funcs[CAN_POS] = orth.can_tag('POS', TAG_THRESH)
|
flag_funcs[CAN_POS] = orth.can_tag('POS', TAG_THRESH)
|
||||||
|
flag_funcs[CAN_PRON] = orth.can_tag('PRON', TAG_THRESH)
|
||||||
flag_funcs[CAN_PRT] = orth.can_tag('PRT', TAG_THRESH)
|
flag_funcs[CAN_PRT] = orth.can_tag('PRT', TAG_THRESH)
|
||||||
|
|
||||||
Language.__init__(self, name, flag_funcs)
|
string_funcs = [None for _ in range(NR_VIEWS)]
|
||||||
|
string_funcs[CANON_CASED] = orth.canon_case
|
||||||
|
string_funcs[SHAPE] = orth.word_shape
|
||||||
|
string_funcs[NON_SPARSE] = orth.non_sparse
|
||||||
|
self.name = name
|
||||||
|
self.cache = {}
|
||||||
|
lang_data = util.read_lang_data(name)
|
||||||
|
rules, words, probs, clusters, case_stats, tag_stats = lang_data
|
||||||
|
self.lexicon = lang.Lexicon(words, probs, clusters, case_stats, tag_stats,
|
||||||
|
string_funcs, flag_funcs)
|
||||||
|
self._load_special_tokenization(rules)
|
||||||
|
|
||||||
cdef int _split_one(self, unicode word):
|
cdef int _split_one(self, unicode word):
|
||||||
cdef size_t length = len(word)
|
cdef size_t length = len(word)
|
||||||
|
@ -149,4 +169,4 @@ cdef bint _check_punct(unicode word, size_t i, size_t length):
|
||||||
return not word[i].isalnum()
|
return not word[i].isalnum()
|
||||||
|
|
||||||
|
|
||||||
EN = English('en')
|
EN = English('en', [], [])
|
||||||
|
|
|
@ -30,7 +30,7 @@ cdef class Language:
|
||||||
|
|
||||||
The language's name is used to look up default data-files, found in data/<name.
|
The language's name is used to look up default data-files, found in data/<name.
|
||||||
"""
|
"""
|
||||||
def __cinit__(self, name, string_features=None, flag_features=None):
|
def __cinit__(self, name, string_features, flag_features):
|
||||||
if flag_features is None:
|
if flag_features is None:
|
||||||
flag_features = []
|
flag_features = []
|
||||||
if string_features is None:
|
if string_features is None:
|
||||||
|
|
|
@ -53,6 +53,7 @@ cdef class Lexeme:
|
||||||
self.length = len(string)
|
self.length = len(string)
|
||||||
self.string = string
|
self.string = string
|
||||||
|
|
||||||
|
self.views = []
|
||||||
for string_feature in string_features:
|
for string_feature in string_features:
|
||||||
view = string_feature(string, prob, cluster, case_stats, tag_stats)
|
view = string_feature(string, prob, cluster, case_stats, tag_stats)
|
||||||
self.views.append(view)
|
self.views.append(view)
|
||||||
|
|
|
@ -1,18 +0,0 @@
|
||||||
import pytest
|
|
||||||
|
|
||||||
from spacy._hashing import FixedTable
|
|
||||||
|
|
||||||
|
|
||||||
def test_insert():
|
|
||||||
table = FixedTable(20)
|
|
||||||
table[5] = 10
|
|
||||||
assert table.bucket(5) == 5
|
|
||||||
assert table[4] == 0
|
|
||||||
assert table[5] == 10
|
|
||||||
|
|
||||||
def test_clobber():
|
|
||||||
table = FixedTable(10)
|
|
||||||
table[9] = 1
|
|
||||||
assert table.bucket(9) == 9
|
|
||||||
assert table.bucket(19) == 9
|
|
||||||
|
|
|
@ -2,26 +2,27 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from spacy.en import lookup, unhash
|
|
||||||
import spacy.word
|
import spacy.word
|
||||||
|
from spacy import en
|
||||||
|
|
||||||
|
EN = en.EN
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def C3P0():
|
def C3P0():
|
||||||
return lookup("C3P0")
|
return EN.lookup("C3P0")
|
||||||
|
|
||||||
|
|
||||||
def test_shape(C3P0):
|
def test_shape(C3P0):
|
||||||
# TODO: Fix this
|
assert C3P0.string_view(en.SHAPE) == "XdXd"
|
||||||
assert unhash(C3P0.get_view(2)) == "XdXd"
|
|
||||||
|
|
||||||
|
|
||||||
def test_length():
|
def test_length():
|
||||||
t = lookup('the')
|
t = EN.lookup('the')
|
||||||
assert t.length == 3
|
assert t.length == 3
|
||||||
t = lookup("n't")
|
t = EN.lookup("n't")
|
||||||
assert t.length == 3
|
assert t.length == 3
|
||||||
t = lookup("'s")
|
t = EN.lookup("'s")
|
||||||
assert t.length == 2
|
assert t.length == 2
|
||||||
t = lookup('Xxxx')
|
t = EN.lookup('Xxxx')
|
||||||
assert t.length == 4
|
assert t.length == 4
|
||||||
|
|
Loading…
Reference in New Issue
Block a user