Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-26 17:24:41 +03:00)
* Refactor to use tokens class.
This commit is contained in:
Parent: cf412adba8
Commit: 7c09c73a14
|
@@ -218,6 +218,9 @@ cdef class English(Language):
|
||||||
name (unicode): The two letter code used by Wikipedia for the language.
|
name (unicode): The two letter code used by Wikipedia for the language.
|
||||||
lexicon (Lexicon): The lexicon. Exposes the lookup method.
|
lexicon (Lexicon): The lexicon. Exposes the lookup method.
|
||||||
"""
|
"""
|
||||||
|
fl_is_alpha = Flag_IsAlpha
|
||||||
|
fl_is_digit = Flag_IsDigit
|
||||||
|
v_shape = View_WordShape
|
||||||
def __cinit__(self, name, user_string_features, user_flag_features):
|
def __cinit__(self, name, user_string_features, user_flag_features):
|
||||||
self.cache = {}
|
self.cache = {}
|
||||||
lang_data = util.read_lang_data(name)
|
lang_data = util.read_lang_data(name)
|
||||||
|
@@ -226,7 +229,7 @@ cdef class English(Language):
|
||||||
STRING_VIEW_FUNCS + user_string_features,
|
STRING_VIEW_FUNCS + user_string_features,
|
||||||
FLAG_FUNCS + user_flag_features)
|
FLAG_FUNCS + user_flag_features)
|
||||||
self._load_special_tokenization(rules)
|
self._load_special_tokenization(rules)
|
||||||
self.token_class = EnglishTokens
|
self.tokens_class = EnglishTokens
|
||||||
|
|
||||||
cdef int _split_one(self, unicode word):
|
cdef int _split_one(self, unicode word):
|
||||||
cdef size_t length = len(word)
|
cdef size_t length = len(word)
|
||||||
|
|
|
@@ -43,7 +43,7 @@ cdef class Language:
|
||||||
self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
|
self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
|
||||||
string_features, flag_features)
|
string_features, flag_features)
|
||||||
self._load_special_tokenization(rules)
|
self._load_special_tokenization(rules)
|
||||||
self.token_class = Tokens
|
self.tokens_class = Tokens
|
||||||
|
|
||||||
property nr_types:
|
property nr_types:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
@@ -81,7 +81,7 @@ cdef class Language:
|
||||||
cdef size_t length = len(string)
|
cdef size_t length = len(string)
|
||||||
cdef size_t start = 0
|
cdef size_t start = 0
|
||||||
cdef size_t i = 0
|
cdef size_t i = 0
|
||||||
cdef Tokens tokens = self.token_class()
|
cdef Tokens tokens = self.tokens_class()
|
||||||
for c in string:
|
for c in string:
|
||||||
if c == ' ':
|
if c == ' ':
|
||||||
if start < i:
|
if start < i:
|
||||||
|
@@ -91,7 +91,7 @@ cdef class Language:
|
||||||
if start < i:
|
if start < i:
|
||||||
self._tokenize(tokens, string[start:i])
|
self._tokenize(tokens, string[start:i])
|
||||||
assert tokens
|
assert tokens
|
||||||
return tokens
|
return tokens.lexemes
|
||||||
|
|
||||||
cdef _tokenize(self, Tokens tokens, unicode string):
|
cdef _tokenize(self, Tokens tokens, unicode string):
|
||||||
cdef list lexemes
|
cdef list lexemes
|
||||||
|
|
|
@@ -56,7 +56,7 @@ def oft_case(name, thresh):
|
||||||
return wrapped
|
return wrapped
|
||||||
|
|
||||||
|
|
||||||
def can_tag(name, thresh):
|
def can_tag(name, thresh=0.5):
|
||||||
def wrapped(string, prob, case_stats, tag_stats):
|
def wrapped(string, prob, case_stats, tag_stats):
|
||||||
return string
|
return string
|
||||||
return wrapped
|
return wrapped
|
||||||
|
@@ -111,7 +111,7 @@ def non_sparse(string, prob, cluster, case_stats, tag_stats):
|
||||||
return word_shape(string, prob, cluster, case_stats, tag_stats)
|
return word_shape(string, prob, cluster, case_stats, tag_stats)
|
||||||
|
|
||||||
|
|
||||||
def asciied(string):
|
def asciied(string, prob=0, cluster=0, case_stats=None, tag_stats=None):
|
||||||
'''"ASCIIfy" a Unicode string by stripping all umlauts, tildes, etc.'''
|
'''"ASCIIfy" a Unicode string by stripping all umlauts, tildes, etc.'''
|
||||||
# Snippet from
|
# Snippet from
|
||||||
# http://www.physic.ut.ee/~kkannike/english/prog/python/util/asciify/index.html
|
# http://www.physic.ut.ee/~kkannike/english/prog/python/util/asciify/index.html
|
||||||
|
|
|
@@ -3,16 +3,16 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from spacy.orth import asciify
|
from spacy.orth import asciied
|
||||||
|
|
||||||
|
|
||||||
def test_tilde():
|
def test_tilde():
|
||||||
string = u'hõmbre'
|
string = u'hõmbre'
|
||||||
assert asciify(string) == u'hombre'
|
assert asciied(string) == u'hombre'
|
||||||
|
|
||||||
|
|
||||||
def test_smart_quote():
|
def test_smart_quote():
|
||||||
string = u'“'
|
string = u'“'
|
||||||
assert asciify(string) == '"'
|
assert asciied(string) == '"'
|
||||||
string = u'”'
|
string = u'”'
|
||||||
assert asciify(string) == '"'
|
assert asciied(string) == '"'
|
||||||
|
|
|
@@ -3,9 +3,7 @@ from __future__ import unicode_literals
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
import spacy.word
|
import spacy.word
|
||||||
from spacy import en
|
from spacy.en import EN
|
||||||
|
|
||||||
EN = en.EN
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
@@ -14,7 +12,7 @@ def C3P0():
|
||||||
|
|
||||||
|
|
||||||
def test_shape(C3P0):
|
def test_shape(C3P0):
|
||||||
assert C3P0.string_view(en.SHAPE) == "XdXd"
|
assert C3P0.string_view(EN.v_shape) == "XdXd"
|
||||||
|
|
||||||
|
|
||||||
def test_length():
|
def test_length():
|
||||||
|
|
Loading…
Reference in New Issue
Block a user