Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-12 10:16:27 +03:00)
* Refactor to use tokens class.
parent cf412adba8
commit 7c09c73a14
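Taken together, the hunks below do two things: they rename the Language.token_class attribute to Language.tokens_class (and likewise on the English subclass), and they change tokenize() to return tokens.lexemes rather than the Tokens object itself. A minimal sketch of the resulting call pattern, assuming the module-level EN instance that the updated tests import from spacy.en:

    from spacy.en import EN

    # tokenize() builds an instance of EN.tokens_class internally and,
    # after this commit, hands back its .lexemes list.
    lexemes = EN.tokenize(u'Hello world')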
@@ -218,6 +218,9 @@ cdef class English(Language):
         name (unicode): The two letter code used by Wikipedia for the language.
         lexicon (Lexicon): The lexicon. Exposes the lookup method.
     """
+    fl_is_alpha = Flag_IsAlpha
+    fl_is_digit = Flag_IsDigit
+    v_shape = View_WordShape
     def __cinit__(self, name, user_string_features, user_flag_features):
         self.cache = {}
         lang_data = util.read_lang_data(name)
@@ -226,7 +229,7 @@ cdef class English(Language):
                                STRING_VIEW_FUNCS + user_string_features,
                                FLAG_FUNCS + user_flag_features)
         self._load_special_tokenization(rules)
-        self.token_class = EnglishTokens
+        self.tokens_class = EnglishTokens
 
     cdef int _split_one(self, unicode word):
         cdef size_t length = len(word)
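The three new class attributes re-export the flag and view constants on English itself, so callers can write EN.v_shape instead of importing module-level IDs (the updated test_shape at the bottom of this diff does exactly that). A hedged sketch, assuming the lexicon's lookup method mentioned in the docstring above returns a word object exposing string_view:

    word = EN.lexicon.lookup(u'C3P0')
    assert word.string_view(EN.v_shape) == u'XdXd'
    # Flag features would be queried analogously through fl_is_alpha and
    # fl_is_digit (the flag-checking method is assumed, not shown in this diff).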
@@ -43,7 +43,7 @@ cdef class Language:
         self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
                                string_features, flag_features)
         self._load_special_tokenization(rules)
-        self.token_class = Tokens
+        self.tokens_class = Tokens
 
     property nr_types:
         def __get__(self):
@@ -81,7 +81,7 @@ cdef class Language:
         cdef size_t length = len(string)
         cdef size_t start = 0
         cdef size_t i = 0
-        cdef Tokens tokens = self.token_class()
+        cdef Tokens tokens = self.tokens_class()
         for c in string:
             if c == ' ':
                 if start < i:
@@ -91,7 +91,7 @@ cdef class Language:
         if start < i:
             self._tokenize(tokens, string[start:i])
         assert tokens
-        return tokens
+        return tokens.lexemes
 
     cdef _tokenize(self, Tokens tokens, unicode string):
         cdef list lexemes
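For orientation, tokenize() scans the string for spaces, hands each non-space span to _tokenize(), and now returns the accumulated lexemes. A rough pure-Python sketch of that loop; the start-pointer bookkeeping between the two hunks is not shown in this diff, so those lines are an assumption:

    def tokenize_sketch(string, tokens_class, _tokenize):
        tokens = tokens_class()               # was: self.token_class()
        start = 0
        for i, c in enumerate(string):
            if c == ' ':
                if start < i:
                    _tokenize(tokens, string[start:i])
                start = i + 1                 # assumed: skip past the space
        if start < len(string):               # trailing span after the last space
            _tokenize(tokens, string[start:])
        return tokens.lexemes                 # was: return tokens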
@@ -56,7 +56,7 @@ def oft_case(name, thresh):
     return wrapped
 
 
-def can_tag(name, thresh):
+def can_tag(name, thresh=0.5):
     def wrapped(string, prob, case_stats, tag_stats):
         return string
     return wrapped
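Like oft_case directly above it, can_tag is a feature-function factory: the outer call fixes a tag name and a probability threshold, and the returned closure exposes the uniform (string, prob, case_stats, tag_stats) signature. With the new default, callers can omit the threshold; the tag names here are illustrative:

    can_noun = can_tag('NN')                  # thresh now defaults to 0.5
    can_verb = can_tag('VB', thresh=0.8)      # explicit threshold still works
    # each closure is later invoked as feature(string, prob, case_stats, tag_stats)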
@@ -111,7 +111,7 @@ def non_sparse(string, prob, cluster, case_stats, tag_stats):
     return word_shape(string, prob, cluster, case_stats, tag_stats)
 
 
-def asciied(string):
+def asciied(string, prob=0, cluster=0, case_stats=None, tag_stats=None):
     '''"ASCIIfy" a Unicode string by stripping all umlauts, tildes, etc.'''
     # Snippet from
     # http://www.physic.ut.ee/~kkannike/english/prog/python/util/asciify/index.html
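Giving asciied the same five-parameter shape as non_sparse above means it can be registered alongside the other string-view functions and invoked with the full argument tuple, while plain one-argument calls keep working:

    from spacy.orth import asciied

    asciied(u'hõmbre')                        # direct call; extra args defaulted
    asciied(u'hõmbre', 0.0, 0, None, None)    # view-function-style call, same result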
@@ -3,16 +3,16 @@
 from __future__ import unicode_literals
 import pytest
 
-from spacy.orth import asciify
+from spacy.orth import asciied
 
 
 def test_tilde():
     string = u'hõmbre'
-    assert asciify(string) == u'hombre'
+    assert asciied(string) == u'hombre'
 
 
 def test_smart_quote():
     string = u'“'
-    assert asciify(string) == '"'
+    assert asciied(string) == '"'
     string = u'”'
-    assert asciify(string) == '"'
+    assert asciied(string) == '"'
@@ -3,9 +3,7 @@ from __future__ import unicode_literals
 import pytest
 
 import spacy.word
-from spacy import en
-
-EN = en.EN
+from spacy.en import EN
 
 
 @pytest.fixture
@@ -14,7 +12,7 @@ def C3P0():
 
 
 def test_shape(C3P0):
-    assert C3P0.string_view(en.SHAPE) == "XdXd"
+    assert C3P0.string_view(EN.v_shape) == "XdXd"
 
 
 def test_length():
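The rewritten assertion reads the shape view through the renamed class attribute rather than a module-level constant. The word-shape transform maps uppercase letters to 'X', lowercase to 'x', and digits to 'd', hence 'C3P0' -> 'XdXd'. A standalone approximation (the real view comes from word_shape in spacy.orth, which may also collapse long character runs):

    def word_shape_sketch(string):
        shape = []
        for c in string:
            if c.isdigit():
                shape.append('d')
            elif c.isupper():
                shape.append('X')
            elif c.islower():
                shape.append('x')
            else:
                shape.append(c)
        return ''.join(shape)

    assert word_shape_sketch(u'C3P0') == u'XdXd'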