* Refactor to use tokens class.

This commit is contained in:
Matthew Honnibal 2014-09-10 18:27:44 +02:00
parent cf412adba8
commit 7c09c73a14
5 changed files with 15 additions and 14 deletions

View File

@@ -218,6 +218,9 @@ cdef class English(Language):
name (unicode): The two letter code used by Wikipedia for the language. name (unicode): The two letter code used by Wikipedia for the language.
lexicon (Lexicon): The lexicon. Exposes the lookup method. lexicon (Lexicon): The lexicon. Exposes the lookup method.
""" """
fl_is_alpha = Flag_IsAlpha
fl_is_digit = Flag_IsDigit
v_shape = View_WordShape
def __cinit__(self, name, user_string_features, user_flag_features): def __cinit__(self, name, user_string_features, user_flag_features):
self.cache = {} self.cache = {}
lang_data = util.read_lang_data(name) lang_data = util.read_lang_data(name)
@@ -226,7 +229,7 @@ cdef class English(Language):
STRING_VIEW_FUNCS + user_string_features, STRING_VIEW_FUNCS + user_string_features,
FLAG_FUNCS + user_flag_features) FLAG_FUNCS + user_flag_features)
self._load_special_tokenization(rules) self._load_special_tokenization(rules)
self.token_class = EnglishTokens self.tokens_class = EnglishTokens
cdef int _split_one(self, unicode word): cdef int _split_one(self, unicode word):
cdef size_t length = len(word) cdef size_t length = len(word)

View File

@@ -43,7 +43,7 @@ cdef class Language:
self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats, self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
string_features, flag_features) string_features, flag_features)
self._load_special_tokenization(rules) self._load_special_tokenization(rules)
self.token_class = Tokens self.tokens_class = Tokens
property nr_types: property nr_types:
def __get__(self): def __get__(self):
@@ -81,7 +81,7 @@ cdef class Language:
cdef size_t length = len(string) cdef size_t length = len(string)
cdef size_t start = 0 cdef size_t start = 0
cdef size_t i = 0 cdef size_t i = 0
cdef Tokens tokens = self.token_class() cdef Tokens tokens = self.tokens_class()
for c in string: for c in string:
if c == ' ': if c == ' ':
if start < i: if start < i:
@@ -91,7 +91,7 @@ cdef class Language:
if start < i: if start < i:
self._tokenize(tokens, string[start:i]) self._tokenize(tokens, string[start:i])
assert tokens assert tokens
return tokens return tokens.lexemes
cdef _tokenize(self, Tokens tokens, unicode string): cdef _tokenize(self, Tokens tokens, unicode string):
cdef list lexemes cdef list lexemes

View File

@@ -56,7 +56,7 @@ def oft_case(name, thresh):
return wrapped return wrapped
def can_tag(name, thresh): def can_tag(name, thresh=0.5):
def wrapped(string, prob, case_stats, tag_stats): def wrapped(string, prob, case_stats, tag_stats):
return string return string
return wrapped return wrapped
@@ -111,7 +111,7 @@ def non_sparse(string, prob, cluster, case_stats, tag_stats):
return word_shape(string, prob, cluster, case_stats, tag_stats) return word_shape(string, prob, cluster, case_stats, tag_stats)
def asciied(string): def asciied(string, prob=0, cluster=0, case_stats=None, tag_stats=None):
'''"ASCIIfy" a Unicode string by stripping all umlauts, tildes, etc.''' '''"ASCIIfy" a Unicode string by stripping all umlauts, tildes, etc.'''
# Snippet from # Snippet from
# http://www.physic.ut.ee/~kkannike/english/prog/python/util/asciify/index.html # http://www.physic.ut.ee/~kkannike/english/prog/python/util/asciify/index.html

View File

@@ -3,16 +3,16 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest import pytest
from spacy.orth import asciify from spacy.orth import asciied
def test_tilde(): def test_tilde():
string = u'hõmbre' string = u'hõmbre'
assert asciify(string) == u'hombre' assert asciied(string) == u'hombre'
def test_smart_quote(): def test_smart_quote():
string = u'“' string = u'“'
assert asciify(string) == '"' assert asciied(string) == '"'
string = u'”' string = u'”'
assert asciify(string) == '"' assert asciied(string) == '"'

View File

@@ -3,9 +3,7 @@ from __future__ import unicode_literals
import pytest import pytest
import spacy.word import spacy.word
from spacy import en from spacy.en import EN
EN = en.EN
@pytest.fixture @pytest.fixture
@@ -14,7 +12,7 @@ def C3P0():
def test_shape(C3P0): def test_shape(C3P0):
assert C3P0.string_view(en.SHAPE) == "XdXd" assert C3P0.string_view(EN.v_shape) == "XdXd"
def test_length(): def test_length():