mirror of https://github.com/explosion/spaCy.git
synced 2024-12-26 09:56:28 +03:00

commit c282e6d5fb (parent fd4e61e58b): Redesign proceeding

spacy/en.pyx (108 changed lines)
@@ -45,8 +45,71 @@ cimport lang
 from spacy import orth
 
+TAG_THRESH = 0.5
+UPPER_THRESH = 0.2
+LOWER_THRESH = 0.5
+TITLE_THRESH = 0.7
+
+NR_FLAGS = 0
+
+OFT_UPPER = NR_FLAGS; NR_FLAGS += 1
+OFT_LOWER = NR_FLAGS; NR_FLAGS += 1
+OFT_TITLE = NR_FLAGS; NR_FLAGS += 1
+
+IS_ALPHA = NR_FLAGS; NR_FLAGS += 1
+IS_DIGIT = NR_FLAGS; NR_FLAGS += 1
+IS_PUNCT = NR_FLAGS; NR_FLAGS += 1
+IS_SPACE = NR_FLAGS; NR_FLAGS += 1
+IS_ASCII = NR_FLAGS; NR_FLAGS += 1
+IS_TITLE = NR_FLAGS; NR_FLAGS += 1
+IS_LOWER = NR_FLAGS; NR_FLAGS += 1
+IS_UPPER = NR_FLAGS; NR_FLAGS += 1
+
+CAN_PUNCT = NR_FLAGS; NR_FLAGS += 1
+CAN_CONJ = NR_FLAGS; NR_FLAGS += 1
+CAN_NUM = NR_FLAGS; NR_FLAGS += 1
+CAN_DET = NR_FLAGS; NR_FLAGS += 1
+CAN_ADP = NR_FLAGS; NR_FLAGS += 1
+CAN_ADJ = NR_FLAGS; NR_FLAGS += 1
+CAN_ADV = NR_FLAGS; NR_FLAGS += 1
+CAN_VERB = NR_FLAGS; NR_FLAGS += 1
+CAN_NOUN = NR_FLAGS; NR_FLAGS += 1
+CAN_PDT = NR_FLAGS; NR_FLAGS += 1
+CAN_POS = NR_FLAGS; NR_FLAGS += 1
+CAN_PRON = NR_FLAGS; NR_FLAGS += 1
+CAN_PRT = NR_FLAGS; NR_FLAGS += 1
+
+
 cdef class English(Language):
+    def __cinit__(self, name):
+        flag_funcs = [0 for _ in range(NR_FLAGS)]
+
+        flag_funcs[OFT_UPPER] = orth.oft_case('upper', UPPER_THRESH)
+        flag_funcs[OFT_LOWER] = orth.oft_case('lower', LOWER_THRESH)
+        flag_funcs[OFT_TITLE] = orth.oft_case('title', TITLE_THRESH)
+
+        flag_funcs[IS_ALPHA] = orth.is_alpha
+        flag_funcs[IS_DIGIT] = orth.is_digit
+        flag_funcs[IS_PUNCT] = orth.is_punct
+        flag_funcs[IS_SPACE] = orth.is_space
+        flag_funcs[IS_TITLE] = orth.is_title
+        flag_funcs[IS_LOWER] = orth.is_lower
+        flag_funcs[IS_UPPER] = orth.is_upper
+
+        flag_funcs[CAN_PUNCT] = orth.can_tag('PUNCT', TAG_THRESH)
+        flag_funcs[CAN_CONJ] = orth.can_tag('CONJ', TAG_THRESH)
+        flag_funcs[CAN_NUM] = orth.can_tag('NUM', TAG_THRESH)
+        flag_funcs[CAN_DET] = orth.can_tag('DET', TAG_THRESH)
+        flag_funcs[CAN_ADP] = orth.can_tag('ADP', TAG_THRESH)
+        flag_funcs[CAN_ADJ] = orth.can_tag('ADJ', TAG_THRESH)
+        flag_funcs[CAN_VERB] = orth.can_tag('VERB', TAG_THRESH)
+        flag_funcs[CAN_NOUN] = orth.can_tag('NOUN', TAG_THRESH)
+        flag_funcs[CAN_PDT] = orth.can_tag('PDT', TAG_THRESH)
+        flag_funcs[CAN_POS] = orth.can_tag('POS', TAG_THRESH)
+        flag_funcs[CAN_PRT] = orth.can_tag('PRT', TAG_THRESH)
+
+        Language.__init__(self, name, flag_funcs)
+
     cpdef int _split_one(self, unicode word):
         cdef size_t length = len(word)
         cdef int i = 0
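The added block uses a running counter as a poor man's enum: each "NAME = NR_FLAGS; NR_FLAGS += 1" line mints the next integer ID, and the same constant then indexes the flag_funcs table inside English.__cinit__. A minimal plain-Python sketch of the pattern follows, with stand-in predicates rather than spacy.orth's real functions. Note that in the hunk above IS_ASCII, CAN_ADV and CAN_PRON receive IDs but are never wired to a function, so those slots keep their 0 placeholder.

    # Counter-as-enum: each flag ID doubles as an index into the function table.
    NR_FLAGS = 0
    IS_ALPHA = NR_FLAGS; NR_FLAGS += 1
    IS_DIGIT = NR_FLAGS; NR_FLAGS += 1

    flag_funcs = [None] * NR_FLAGS
    flag_funcs[IS_ALPHA] = lambda s: s.isalpha()  # stand-in for orth.is_alpha
    flag_funcs[IS_DIGIT] = lambda s: s.isdigit()  # stand-in for orth.is_digit

    # Flag bit i of a word is computed by the function stored at index i.
    assert flag_funcs[IS_ALPHA](u'hello')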
@@ -81,48 +144,3 @@ cdef bint _check_punct(unicode word, size_t i, size_t length):
 
 
 EN = English('en')
-
-
-# Thresholds for frequency related flags
-cdef double TAG_THRESH = 0.5
-cdef double LOWER_THRESH = 0.5
-cdef double UPPER_THRESH = 0.3
-cdef double TITLE_THRESH = 0.9
-
-
-# Python-readable flag constants --- can't read an enum from Python
-ALPHA = EN.lexicon.add_flag(orth.is_alpha)
-DIGIT = EN.lexicon.add_flag(orth.is_digit)
-PUNCT = EN.lexicon.add_flag(orth.is_punct)
-SPACE = EN.lexicon.add_flag(orth.is_space)
-PUNCT = EN.lexicon.add_flag(orth.is_punct)
-ASCII = EN.lexicon.add_flag(orth.is_ascii)
-TITLE = EN.lexicon.add_flag(orth.is_title)
-LOWER = EN.lexicon.add_flag(orth.is_lower)
-UPPER = EN.lexicon.add_flag(orth.is_upper)
-
-OFT_LOWER = EN.lexicon.add_flag(orth.case_trend('lower', LOWER_THRESH))
-OFT_UPPER = EN.lexicon.add_flag(orth.case_trend('upper', UPPER_THRESH))
-OFT_TITLE = EN.lexicon.add_flag(orth.case_trend('title', TITLE_THRESH))
-
-CAN_PUNCT = EN.lexicon.add_flag(orth.can_tag("PUNCT", TAG_THRESH))
-CAN_CONJ = EN.lexicon.add_flag(orth.can_tag("CONJ", TAG_THRESH))
-CAN_NUM = EN.lexicon.add_flag(orth.can_tag("NUM", TAG_THRESH))
-CAN_N = EN.lexicon.add_flag(orth.can_tag("N", TAG_THRESH))
-CAN_DET = EN.lexicon.add_flag(orth.can_tag("DET", TAG_THRESH))
-CAN_ADP = EN.lexicon.add_flag(orth.can_tag("ADP", TAG_THRESH))
-CAN_ADJ = EN.lexicon.add_flag(orth.can_tag("ADJ", TAG_THRESH))
-CAN_ADV = EN.lexicon.add_flag(orth.can_tag("ADV", TAG_THRESH))
-CAN_VERB = EN.lexicon.add_flag(orth.can_tag("VERB", TAG_THRESH))
-CAN_NOUN = EN.lexicon.add_flag(orth.can_tag("NOUN", TAG_THRESH))
-CAN_PDT = EN.lexicon.add_flag(orth.can_tag("PDT", TAG_THRESH))
-CAN_POS = EN.lexicon.add_flag(orth.can_tag("POS", TAG_THRESH))
-CAN_PRON = EN.lexicon.add_flag(orth.can_tag("PRON", TAG_THRESH))
-CAN_PRT = EN.lexicon.add_flag(orth.can_tag("PRT", TAG_THRESH))
-
-
-# These are the name of string transforms
-SIC = EN.lexicon.add_transform(orth.sic_string)
-CANON_CASED = EN.lexicon.add_transform(orth.canon_case)
-SHAPE = EN.lexicon.add_transform(orth.word_shape)
-NON_SPARSE = EN.lexicon.add_transform(orth.non_sparse)
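Net effect of the two en.pyx hunks: flag IDs such as IS_ALPHA and CAN_VERB become module-level constants wired up before EN is constructed, rather than values returned by EN.lexicon.add_flag() afterwards. Assuming check_flag keeps the signature declared in word.pxd below, downstream usage would look roughly like this sketch (which is not part of the commit):

    # Hypothetical usage of the new module constants; illustrative only.
    from spacy.en import EN, IS_ALPHA, CAN_VERB

    word = EN.lexicon.lookup(u'giggle')
    if word.check_flag(IS_ALPHA) and word.check_flag(CAN_VERB):
        # CAN_* flags mean "tagged this way often enough", per can_tag
        # and TAG_THRESH above.
        print(word.string)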
spacy/lang.pxd

@@ -4,14 +4,10 @@ from spacy.word cimport Lexeme
 
 
 cdef class Lexicon:
-    cdef public dict probs
-    cdef public dict clusters
-    cdef public dict case_stats
-    cdef public dict tag_stats
-    cdef public list flag_checkers
-    cdef public list string_transformers
+    cdef list string_features
+    cdef list flag_features
 
-    cdef dict lexicon
+    cdef dict _dict
 
     cpdef Lexeme lookup(self, unicode string)

@@ -26,4 +22,3 @@ cdef class Language:
     cdef list _tokenize(self, unicode string)
     cpdef list _split(self, unicode string)
     cpdef int _split_one(self, unicode word)
-
spacy/lang.pyx (118 changed lines)

@@ -10,17 +10,38 @@ from __future__ import unicode_literals
 from libc.stdlib cimport calloc, free
 
-from . import util
 import json
 from os import path
 
+from .util import read_lang_data
+
 
 cdef class Language:
-    def __cinit__(self, name):
+    """Base class for language-specific tokenizers.
+
+    Most subclasses will override the _split or _split_one methods, which take
+    a string of non-whitespace characters and output a list of strings. This
+    function is called by _tokenize, which sits behind a cache and turns the
+    list of strings into Lexeme objects via the Lexicon. Most languages will not
+    need to override _tokenize or tokenize.
+
+    The language is supplied a list of boolean functions, used to compute flag
+    features. These are passed to the language's Lexicon object.
+
+    The language's name is used to look up default data-files, found in data/<name.
+    """
+    def __cinit__(self, name, string_features=None, flag_features=None):
+        if flag_features is None:
+            flag_features = []
+        if string_features is None:
+            string_features = []
         self.name = name
         self.cache = {}
-        self.lexicon = Lexicon()
-        self.load_special_tokenization(util.read_tokenization(name))
+        lang_data = read_lang_data(name)
+        rules, words, probs, clusters, case_stats, tag_stats = lang_data
+        self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
+                               string_features, flag_features)
+        self.load_special_tokenization(rules)
 
     cpdef list tokenize(self, unicode string):
         """Tokenize a string.
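The new class docstring names the extension point: subclasses override _split or _split_one and inherit tokenize and _tokenize. Stripped of the Cython declarations, a subclass would look roughly like the sketch below. PeriodLang and its heuristic are invented, and the exact _split_one contract (it appears to return how many leading characters of a non-whitespace chunk form the first token, judging by the int signature and the en.pyx implementation) is inferred rather than stated in the diff.

    from spacy.lang import Language  # the Cython base class from this diff

    class PeriodLang(Language):
        def _split_one(self, word):
            # Split a trailing full stop off; otherwise take the whole chunk.
            if len(word) > 1 and word.endswith(u'.'):
                return len(word) - 1
            return len(word)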
@@ -37,6 +58,8 @@ cdef class Language:
         Returns:
             tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
         """
+        if not string:
+            return []
         cdef list tokens = []
         cdef size_t length = len(string)
         cdef size_t start = 0
@@ -107,85 +130,32 @@ cdef class Language:
 
 
 cdef class Lexicon:
-    def __cinit__(self):
-        self.flag_checkers = []
-        self.string_transformers = []
-        self.probs = {}
-        self.clusters = {}
-        self.case_stats = {}
-        self.tag_stats = {}
-        self.lexicon = {}
+    def __cinit__(self, words, probs, clusters, case_stats, tag_stats,
+                  string_features, flag_features):
+        self.flag_features = flag_features
+        self.string_features = string_features
+        self._dict = {}
+        cdef Lexeme word
+        for string in words:
+            word = Lexeme(string, probs.get(string, 0.0), clusters.get(string, 0),
+                          case_stats.get(string, {}), tag_stats.get(string, {}),
+                          self.string_features, self.flag_features)
+            self._dict[string] = word
 
     cpdef Lexeme lookup(self, unicode string):
         """Retrieve (or create, if not found) a Lexeme for a string, and return it.
 
-        Args:
+        Args
             string (unicode): The string to be looked up. Must be unicode, not bytes.
 
         Returns:
             lexeme (Lexeme): A reference to a lexical type.
         """
         assert len(string) != 0
-        if string in self.lexicon:
-            return self.lexicon[string]
+        if string in self._dict:
+            return self._dict[string]
 
-        prob = _pop_default(self.probs, string, 0.0)
-        cluster = _pop_default(self.clusters, string, 0.0)
-        case_stats = _pop_default(self.case_stats, string, {})
-        tag_stats = _pop_default(self.tag_stats, string, {})
-
-        cdef Lexeme word = Lexeme(string, prob, cluster, case_stats, tag_stats,
-                                  self.flag_checkers, self.string_transformers)
-        self.lexicon[string] = word
+        cdef Lexeme word = Lexeme(string, 0, 0, {}, {}, self.string_features,
+                                  self.flag_features)
+        self._dict[string] = word
         return word
 
-    def add_flag(self, flag_checker):
-        cdef unicode string
-        cdef Lexeme word
-        flag_id = len(self.flag_checkers)
-        for string, word in self.lexicon.items():
-            if flag_checker(string, word.prob, {}, {}):
-                word.set_flag(flag_id)
-        self.flag_checkers.append(flag_checker)
-        return flag_id
-
-    def add_transform(self, string_transform):
-        self.string_transformers.append(string_transform)
-        for string, word in self.lexicon.items():
-            word.add_view(string_transform(string, word.prob, {}, {}))
-        return len(self.string_transformers) - 1
-
-    def load_probs(self, location):
-        """Load unigram probabilities.
-        """
-        # Dict mapping words to floats
-        self.probs = json.load(location)
-
-        cdef Lexeme word
-        cdef unicode string
-
-        for string, word in self.lexicon.items():
-            prob = _pop_default(self.probs, string, 0.0)
-            word.prob = prob
-
-    def load_clusters(self, location):
-        # TODO: Find out endianness
-        # Dict mapping words to ??-endian ints
-        self.clusters = json.load(location)
-
-        cdef Lexeme word
-        cdef unicode string
-
-        for string, word in self.lexicon.items():
-            cluster = _pop_default(self.clusters, string, 0)
-            word.cluster = cluster
-
-    def load_stats(self, location):
-        """Load distributional stats.
-        """
-        # Dict mapping string to dict of arbitrary stuff.
-        raise NotImplementedError
-
-
-def _pop_default(dict d, key, default):
-    return d.pop(key) if key in d else default
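To summarise the Lexicon change: all frequency data now arrives through the constructor, a Lexeme is built eagerly for every known word, and lookup only creates (and memoizes) default-valued entries for unseen strings. A plain-Python sketch of that shape, with make_lexeme standing in for the Cython Lexeme constructor:

    # Eager build over known words, memoized creation for unseen ones.
    def make_lexeme(string, prob=0.0, cluster=0):
        return {'string': string, 'prob': prob, 'cluster': cluster}

    class LexiconSketch:
        def __init__(self, words, probs, clusters):
            self._dict = {}
            for string in words:            # every known word, up front
                self._dict[string] = make_lexeme(
                    string, probs.get(string, 0.0), clusters.get(string, 0))

        def lookup(self, string):
            assert len(string) != 0
            if string in self._dict:        # hit: the same object comes back
                return self._dict[string]
            word = make_lexeme(string)      # miss: zero prob, empty stats
            self._dict[string] = word       # memoize for next time
            return word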
spacy/word.pxd

@@ -7,19 +7,14 @@ DEF MAX_FLAG = 64
 cdef class Lexeme:
     # NB: the readonly keyword refers to _Python_ access. The attributes are
     # writeable from Cython.
-    cpdef readonly id_t id
     cpdef readonly size_t length
     cpdef readonly double prob
     cpdef readonly size_t cluster
 
-    cdef list views
-    cdef size_t nr_views
+    cpdef readonly string
+    cpdef readonly list views
 
     cdef readonly flag_t flags
 
     cpdef bint check_flag(self, size_t flag_id) except *
     cpdef int set_flag(self, size_t flag_id) except -1
-
-    cpdef unicode get_view_string(self, size_t i)
-    cpdef id_t get_view_id(self, size_t i) except 0
-    cpdef int add_view(self, unicode view) except -1
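DEF MAX_FLAG = 64 caps the flag inventory because flags is a single flag_t field; check_flag and set_flag are then, presumably, single bit operations. A sketch of that representation, assuming flag_t behaves like a 64-bit unsigned integer (the diff does not show the method bodies):

    MAX_FLAG = 64

    class FlagField:
        # Hypothetical mirror of the bitfield behind check_flag/set_flag.
        def __init__(self):
            self.flags = 0

        def set_flag(self, flag_id):
            assert flag_id < MAX_FLAG
            self.flags |= 1 << flag_id

        def check_flag(self, flag_id):
            assert flag_id < MAX_FLAG
            return bool(self.flags & (1 << flag_id))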
spacy/word.pyx

@@ -49,42 +49,24 @@ cdef class Lexeme:
     while "dapple" is totally different. On the other hand, "scalable" receives
     the same cluster ID as "pineapple", which is not what we'd like.
     """
-    def __cinit__(self, unicode string, prob, cluster, case_stats,
-                  tag_stats, flag_checkers, string_transformers):
+    def __cinit__(self, unicode string, double prob, int cluster, dict case_stats,
+                  dict tag_stats, list string_features, list flag_features):
         self.prob = prob
         self.cluster = cluster
         self.length = len(string)
-        self.id = hash(string)
+        self.string = string
 
-        self.nr_views = len(string_transformers)
-        self.views = []
-        cdef unicode view
-        for i, string_transformer in enumerate(string_transformers):
-            view = string_transformer(string, prob, case_stats, tag_stats)
+        for string_feature in string_features:
+            view = string_feature(string, prob, cluster, case_stats, tag_stats)
             self.views.append(view)
 
-        for i, flag_checker in enumerate(flag_checkers):
-            if flag_checker(string, prob, case_stats, tag_stats):
+        for i, flag_feature in enumerate(flag_features):
+            if flag_feature(string, prob, case_stats, tag_stats):
                 self.set_flag(i)
 
     def __dealloc__(self):
         pass
 
-    property string:
-        def __get__(self):
-            return self.views[0]
-
-    cpdef unicode get_view_string(self, size_t i):
-        assert i < self.nr_views
-        return self.views[i]
-
-    cpdef id_t get_view_id(self, size_t i) except 0:
-        return <id_t>hash(self.views[i])
-
-    cpdef int add_view(self, unicode view) except -1:
-        self.nr_views += 1
-        self.views.append(view)
-
     cpdef bint check_flag(self, size_t flag_id) except *:
         """Access the value of one of the pre-computed boolean distribution features.
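Note the two call signatures the new __cinit__ fixes: string features take (string, prob, cluster, case_stats, tag_stats) and return a unicode view, while flag features take (string, prob, case_stats, tag_stats) and return a truth value. Illustrative conforming functions (these are not spacy.orth's implementations, which this diff does not show):

    def word_shape_view(string, prob, cluster, case_stats, tag_stats):
        # A string feature: map u'Hello' to the view u'Xxxxx'.
        return u''.join(u'X' if c.isupper() else
                        u'x' if c.islower() else
                        u'd' if c.isdigit() else c
                        for c in string)

    def is_title_flag(string, prob, case_stats, tag_stats):
        # A flag feature: a boolean over the string and its statistics.
        return string.istitle()

    assert word_shape_view(u'Hello', 0.0, 0, {}, {}) == u'Xxxxx'
    assert is_title_flag(u'Hello', 0.0, {}, {})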
tests/test_tokenizer.py

@@ -1,42 +1,41 @@
 from __future__ import unicode_literals
 
-from spacy.en import tokenize
-from spacy.en import lookup
+from spacy.en import EN
 
 
 def test_single_word():
-    lex_ids = tokenize(u'hello')
-    assert lex_ids[0] == lookup(u'hello')
+    lex_ids = EN.tokenize(u'hello')
+    assert lex_ids[0] == EN.lexicon.lookup(u'hello')
 
 
 def test_two_words():
-    words = tokenize('hello possums')
+    words = EN.tokenize('hello possums')
     assert len(words) == 2
-    assert words[0] == lookup('hello')
+    assert words[0] == EN.lexicon.lookup('hello')
     assert words[0] != words[1]
 
 
 def test_punct():
-    tokens = tokenize('hello, possums.')
+    tokens = EN.tokenize('hello, possums.')
     assert len(tokens) == 4
-    assert tokens[0].lex == lookup('hello').lex
-    assert tokens[1].lex == lookup(',').lex
-    assert tokens[2].lex == lookup('possums').lex
-    assert tokens[1].lex != lookup('hello').lex
+    assert tokens[0].string == EN.lexicon.lookup('hello').string
+    assert tokens[1].string == EN.lexicon.lookup(',').string
+    assert tokens[2].string == EN.lexicon.lookup('possums').string
+    assert tokens[1].string != EN.lexicon.lookup('hello').string
 
 
 def test_digits():
-    lex_ids = tokenize('The year: 1984.')
+    lex_ids = EN.tokenize('The year: 1984.')
     assert len(lex_ids) == 5
-    assert lex_ids[0].lex == lookup('The').lex
-    assert lex_ids[3].lex == lookup('1984').lex
-    assert lex_ids[4].lex == lookup('.').lex
+    assert lex_ids[0].string == EN.lexicon.lookup('The').string
+    assert lex_ids[3].string == EN.lexicon.lookup('1984').string
+    assert lex_ids[4].string == EN.lexicon.lookup('.').string
 
 
 def test_contraction():
-    lex_ids = tokenize("don't giggle")
+    lex_ids = EN.tokenize("don't giggle")
     assert len(lex_ids) == 3
-    assert lex_ids[1].lex == lookup("not").lex
-    lex_ids = tokenize("i said don't!")
+    assert lex_ids[1].string == EN.lexicon.lookup("not").string
+    lex_ids = EN.tokenize("i said don't!")
     assert len(lex_ids) == 4
-    assert lex_ids[3].lex == lookup('!').lex
+    assert lex_ids[3].string == EN.lexicon.lookup('!').string