* Work on fixing special-cases, reading them in as JSON objects so that they can specify lemmas

This commit is contained in:
  parent cda9ea9a4a
  commit 302e09018b
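The special-case rules are now keyed by the raw chunk, with each substring described by a JSON object: "F" gives the surface form and the optional "L" key gives the lemma (see _load_special_tokenization and set_morph_from_dict in spacy/lang.pyx below). The data file itself is not part of this diff, but a hypothetical entry in the specials.json file read from the language's data directory, consistent with the updated tests, might look like:

    {
        "ain't": [
            {"F": "ai", "L": "be"},
            {"F": "n't", "L": "not"}
        ],
        "we'll": [
            {"F": "we"},
            {"F": "'ll", "L": "will"}
        ]
    }

Per-token morphology keys (e.g. "number", "person", "tenspect") can be added to the same objects and are picked up by set_morph_from_dict.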
spacy/en.pxd (24 lines changed)

@@ -10,6 +10,7 @@ cpdef enum en_person_t:
     FIRST
     SECOND
     THIRD
+    NON_THIRD


 cpdef enum en_number_t:
@@ -17,14 +18,22 @@ cpdef enum en_number_t:
     SINGULAR
     PLURAL
     MASS
-    CARDINAL
-    ORDINAL


 cpdef enum en_gender_t:
     NO_GENDER
     MASCULINE
     FEMININE
+    NEUTER


+cpdef enum en_case_t:
+    NO_CASE
+    NOMINATIVE
+    GENITIVE
+    ACCUSATIVE
+    REFLEXIVE
+    DEMONYM
+
+
 cpdef enum en_tenspect_t:
@@ -37,23 +46,12 @@ cpdef enum en_tenspect_t:
     MODAL


-cpdef enum en_case_t:
-    NO_CASE
-    NOMINATIVE
-    ACCUSATIVE
-    GENITIVE
-    DEMONYM
-
-
 cpdef enum misc_t:
     NO_MISC
     COMPARATIVE
     SUPERLATIVE
     RELATIVE
     NAME
-    URL
-    EMAIL
-    EMOTICON


 # Flags
spacy/en.pyx (25 lines changed)

@@ -38,6 +38,8 @@ import orth
 from .tagger cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
 from .tagger cimport X, PUNCT, EOL

+from .tokens cimport Morphology
+

 POS_TAGS = {
     'NULL': (NO_TAG, {}),
@@ -152,7 +154,8 @@ cdef class English(Language):
         for i in range(tokens.length):
             fill_pos_context(context, i, t)
             t[i].pos = self.pos_tagger.predict(context)
-            #self.morphalyser.set_token(&t[i])
+            _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
+            t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)

     def train_pos(self, Tokens tokens, golds):
         cdef int i
@@ -162,11 +165,27 @@ cdef class English(Language):
         for i in range(tokens.length):
             fill_pos_context(context, i, t)
             t[i].pos = self.pos_tagger.predict(context, [golds[i]])
-            t[i].morph = self.pos_tagger.tags[t[i].pos].morph
-            #self.analyse_morph(&t[i].lemma, &t[i].morph, t[i].pos, t[i].lex)
+            _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
+            t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)
             c += t[i].pos == golds[i]
         return c


+cdef int _merge_morph(Morphology* tok_morph, const Morphology* pos_morph) except -1:
+    if tok_morph.number == 0:
+        tok_morph.number = pos_morph.number
+    if tok_morph.tenspect == 0:
+        tok_morph.tenspect = pos_morph.tenspect
+    if tok_morph.mood == 0:
+        tok_morph.mood = pos_morph.mood
+    if tok_morph.gender == 0:
+        tok_morph.gender = pos_morph.gender
+    if tok_morph.person == 0:
+        tok_morph.person = pos_morph.person
+    if tok_morph.case == 0:
+        tok_morph.case = pos_morph.case
+    if tok_morph.misc == 0:
+        tok_morph.misc = pos_morph.misc
+
+
 EN = English('en')
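_merge_morph (added at the bottom of spacy/en.pyx above) makes the tag-derived morphology a fallback: a field is copied from the POS tag's template only when the token's own value is still 0, so anything a special-case rule set from JSON survives tagging. A rough pure-Python sketch of the same rule, with a hypothetical attribute-holding object standing in for the Morphology struct:

    # Hypothetical mirror of _merge_morph; the real code works on C structs in en.pyx.
    MORPH_FIELDS = ('number', 'tenspect', 'mood', 'gender', 'person', 'case', 'misc')

    def merge_morph(tok_morph, pos_morph):
        for field in MORPH_FIELDS:
            if getattr(tok_morph, field) == 0:      # 0 means "unset"
                setattr(tok_morph, field, getattr(pos_morph, field))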
spacy/lang.pxd

@@ -9,7 +9,7 @@ from .typedefs cimport hash_t
 from .tokens cimport Tokens, TokenC
 from .lexeme cimport Lexeme
 from .tagger cimport Tagger
-from .tagger cimport PosTag
+from .tagger cimport univ_tag_t
 from .utf8string cimport StringStore, UniStr


@@ -38,11 +38,12 @@ cdef class Language:
     cdef object _suffix_re
     cdef object _infix_re

-    cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1
+    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1

     cpdef Tokens tokens_from_list(self, list strings)
     cpdef Tokens tokenize(self, unicode text)

+    cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
     cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
     cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
                                 vector[Lexeme*] *suffixes) except NULL
spacy/lang.pyx (112 lines changed)

@@ -28,6 +28,7 @@ from .util import read_lang_data
 from .tokens import Tokens

 from .tagger cimport NOUN, VERB, ADJ, N_UNIV_TAGS
+from .tokens cimport Morphology


 cdef class Language:
@@ -53,27 +54,27 @@ cdef class Language:
         if path.exists(path.join(util.DATA_DIR, self.name, 'pos')):
             self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos'))

-    cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1:
+    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
         if self.lemmatizer is None:
             return lex.sic
-        if pos.pos != NOUN and pos.pos != VERB and pos.pos != ADJ:
+        if pos != NOUN and pos != VERB and pos != ADJ:
             return lex.sic
-        cdef int lemma = <int><size_t>self._lemmas.get(pos.pos, lex.sic)
+        cdef int lemma = <int><size_t>self._lemmas.get(pos, lex.sic)
         if lemma != 0:
             return lemma
         cdef bytes py_string = self.lexicon.strings[lex.sic]
         cdef set lemma_strings
         cdef bytes lemma_string
-        if pos.pos == NOUN:
+        if pos == NOUN:
             lemma_strings = self.lemmatizer.noun(py_string)
-        elif pos.pos == VERB:
+        elif pos == VERB:
             lemma_strings = self.lemmatizer.verb(py_string)
         else:
-            assert pos.pos == ADJ
+            assert pos == ADJ
             lemma_strings = self.lemmatizer.adj(py_string)
         lemma_string = sorted(lemma_strings)[0]
         lemma = self.lexicon.strings.intern(lemma_string, len(lemma_string)).i
-        self._lemmas.set(pos.pos, lex.sic, <void*>lemma)
+        self._lemmas.set(pos, lex.sic, <void*>lemma)
         return lemma

     cpdef Tokens tokens_from_list(self, list strings):
@@ -111,6 +112,7 @@ cdef class Language:
             return tokens
         cdef int i = 0
         cdef int start = 0
+        cdef bint cache_hit
         cdef Py_UNICODE* chars = string
         cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
         cdef UniStr span
@@ -118,10 +120,8 @@ cdef class Language:
             if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
                 if start < i:
                     slice_unicode(&span, chars, start, i)
-                    lexemes = <const Lexeme* const*>self._cache.get(span.key)
-                    if lexemes != NULL:
-                        tokens.extend(start, lexemes, 0)
-                    else:
+                    cache_hit = self._try_cache(start, span.key, tokens)
+                    if not cache_hit:
                         self._tokenize(tokens, &span, start, i)
                 in_ws = not in_ws
                 start = i
@@ -130,13 +130,32 @@ cdef class Language:
            i += 1
        if start < i:
            slice_unicode(&span, chars, start, i)
-            lexemes = <const Lexeme* const*>self._cache.get(span.key)
-            if lexemes != NULL:
-                tokens.extend(start, lexemes, 0)
-            else:
+            cache_hit = self._try_cache(start, span.key, tokens)
+            if not cache_hit:
                self._tokenize(tokens, &span, start, i)
        return tokens

+    cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
+        cdef int i
+        specials = <TokenC*>self._specials.get(key)
+        if specials != NULL:
+            i = 0
+            while specials[i].lex != NULL:
+                tokens.push_back(idx, specials[i].lex)
+                tokens.data[tokens.length - 1].pos = specials[i].pos
+                tokens.data[tokens.length - 1].morph = specials[i].morph
+                tokens.data[tokens.length - 1].lemma = specials[i].lemma
+                tokens.data[tokens.length - 1].sense = specials[i].sense
+                i += 1
+            return True
+        else:
+            cached = <const Lexeme* const*>self._cache.get(key)
+            if cached != NULL:
+                tokens.extend(i, cached, 0)
+                return True
+            else:
+                return False
+
     cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
         cdef vector[Lexeme*] prefixes
         cdef vector[Lexeme*] suffixes
@@ -190,10 +209,10 @@ cdef class Language:
                 break
         return string

-    cdef int _attach_tokens(self, Tokens tokens,
-                            int idx, UniStr* string,
+    cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
                             vector[const Lexeme*] *prefixes,
                             vector[const Lexeme*] *suffixes) except -1:
+        cdef bint cache_hit
         cdef int split
         cdef const Lexeme* const* lexemes
         cdef Lexeme* lexeme
@@ -201,10 +220,9 @@ cdef class Language:
         if prefixes.size():
             idx = tokens.extend(idx, prefixes.data(), prefixes.size())
         if string.n != 0:
-            lexemes = <const Lexeme* const*>self._cache.get(string.key)
-            if lexemes != NULL:
-                idx = tokens.extend(idx, lexemes, 0)
+            cache_hit = self._try_cache(idx, string.key, tokens)
+            if cache_hit:
+                idx = tokens.data[tokens.length - 1].idx + 1
             else:
                 split = self._find_infix(string.chars, string.n)
                 if split == 0 or split == -1:
@@ -247,30 +265,42 @@ cdef class Language:
         match = self._suffix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0

-    def _load_special_tokenization(self, token_rules):
-        '''Load special-case tokenization rules.
-
-        Loads special-case tokenization rules into the Language._cache cache,
-        read from data/<lang>/tokenization . The special cases are loaded before
-        any language data is tokenized, giving these priority. For instance,
-        the English tokenization rules map "ain't" to ["are", "not"].
-
-        Args:
-            token_rules (list): A list of (chunk, tokens) pairs, where chunk is
-                a string and tokens is a list of strings.
+    def _load_special_tokenization(self, object rules):
+        '''Add a special-case tokenization rule.
         '''
+        cdef int i
+        cdef unicode chunk
+        cdef list substrings
+        cdef unicode form
+        cdef unicode lemma
+        cdef dict props
         cdef Lexeme** lexemes
         cdef hash_t hashed
         cdef UniStr string
-        for uni_string, substrings in token_rules:
-            lexemes = <Lexeme**>self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*))
-            for i, substring in enumerate(substrings):
-                slice_unicode(&string, substring, 0, len(substring))
-                lexemes[i] = <Lexeme*>self.lexicon.get(self.lexicon.mem, &string)
-            lexemes[i + 1] = NULL
-            slice_unicode(&string, uni_string, 0, len(uni_string))
-            self._specials.set(string.key, lexemes)
-            self._cache.set(string.key, lexemes)
+        for chunk, substrings in sorted(rules.items()):
+            tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
+            for i, props in enumerate(substrings):
+                form = props['F']
+                lemma = props.get("L", None)
+                slice_unicode(&string, form, 0, len(form))
+                tokens[i].lex = <Lexeme*>self.lexicon.get(self.lexicon.mem, &string)
+                if lemma:
+                    tokens[i].lemma = self.lexicon.strings[lemma]
+                set_morph_from_dict(&tokens[i].morph, props)
+            # Null-terminated array
+            tokens[i+1].lex = NULL
+            slice_unicode(&string, chunk, 0, len(chunk))
+            self._specials.set(string.key, tokens)
+
+
+cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
+    morph.number = props.get('number', 0)
+    morph.tenspect = props.get('tenspect', 0)
+    morph.mood = props.get('mood', 0)
+    morph.gender = props.get('gender', 0)
+    morph.person = props.get('person', 0)
+    morph.case = props.get('case', 0)
+    morph.misc = props.get('misc', 0)


 cdef class Lexicon:
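Taken together, _load_special_tokenization and _try_cache give special cases priority over the ordinary lexeme cache: a whitespace-delimited chunk is looked up first in _specials, which stores a null-terminated TokenC array carrying pos, morph, lemma and sense, and only then in _cache. A rough pure-Python sketch of that lookup order, with plain dicts standing in for the hash tables (hypothetical names, not the real API):

    # Hypothetical sketch of the lookup order in Language._try_cache.
    def try_cache(idx, key, tokens, specials, cache):
        special = specials.get(key)        # TokenC templates built from the JSON rules
        if special is not None:
            for tok in special:            # null-terminated array in the real code
                tokens.append((idx, tok))  # copies lex, pos, morph, lemma, sense
            return True
        cached = cache.get(key)            # plain lexeme sequence, no annotations
        if cached is not None:
            tokens.extend(cached)
            return True
        return False

For the "ain't" rule sketched above, this is what lets test_aint assert both the surface forms ("ai", "n't") and the lemmas ("be", "not") straight after tokenization.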
spacy/tokens.pxd

@@ -21,7 +21,6 @@ cdef struct Morphology:
     uint8_t misc


-
 cdef struct TokenC:
     const Lexeme* lex
     Morphology morph
spacy/util.py

@@ -13,7 +13,8 @@ def utf8open(loc, mode='r'):

 def read_lang_data(name):
     data_dir = path.join(DATA_DIR, name)
-    tokenization = read_tokenization(name)
+    with open(path.join(data_dir, 'specials.json')) as file_:
+        tokenization = ujson.load(file_)
     prefix = read_prefix(data_dir)
     suffix = read_suffix(data_dir)
     infix = read_infix(data_dir)
@@ -26,12 +27,17 @@ def read_prefix(data_dir):
     expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
     return expression


 def read_suffix(data_dir):
     with utf8open(path.join(data_dir, 'suffix')) as file_:
         entries = file_.read().split('\n')
     expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()])
+    # TODO: Fix this hack!
+    expression += r'|(?<=[a-z0-9])\.$'
+    expression += r'|(?<=[0-9])km$'
     return expression


 def read_infix(data_dir):
     with utf8open(path.join(data_dir, 'infix')) as file_:
         entries = file_.read().split('\n')
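The two patterns appended in read_suffix split a trailing period off tokens ending in a lowercase letter or digit and split a trailing "km" off a number, which is what test_cnts2 and test_cnts6 below rely on. A quick standalone illustration of just those added pieces, using Python's re module:

    import re

    # Only the two pieces appended to the suffix expression in read_suffix.
    suffix_re = re.compile(r'(?<=[a-z0-9])\.$|(?<=[0-9])km$')

    print(suffix_re.search('10km'))      # matches 'km', so '10km' splits into '10', 'km'
    print(suffix_re.search('concern.'))  # matches '.', so the sentence-final period splits off
    print(suffix_re.search('U.N.'))      # no match: '.' follows an uppercase letter, so 'U.N.' stays whole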
tests/test_contractions.py

@@ -20,15 +20,18 @@ def test_apostrophe():
 def test_LL():
     tokens = EN.tokenize("we'll")
     assert len(tokens) == 2
-    assert tokens[1].string == "will"
+    assert tokens[1].string == "'ll"
+    assert tokens[1].lemma == "will"
     assert tokens[0].string == "we"


 def test_aint():
     tokens = EN.tokenize("ain't")
     assert len(tokens) == 2
-    assert tokens[0].string == "are"
-    assert tokens[1].string == "not"
+    assert tokens[0].string == "ai"
+    assert tokens[0].lemma == "be"
+    assert tokens[1].string == "n't"
+    assert tokens[1].lemma == "not"


 def test_capitalized():
@@ -38,7 +41,8 @@ def test_capitalized():
     assert len(tokens) == 2
     tokens = EN.tokenize("Ain't")
     assert len(tokens) == 2
-    assert tokens[0].string == "Are"
+    assert tokens[0].string == "Ai"
+    assert tokens[0].lemma == "be"


 def test_punct():
tests/test_tokenizer.py

@@ -34,7 +34,7 @@ def test_digits():
 def test_contraction():
     tokens = EN.tokenize("don't giggle")
     assert len(tokens) == 3
-    assert tokens[1].sic == EN.lexicon["not"]['sic']
+    assert tokens[1].sic == EN.lexicon["n't"]['sic']
     tokens = EN.tokenize("i said don't!")
     assert len(tokens) == 5
     assert tokens[4].sic == EN.lexicon['!']['sic']
@@ -71,30 +71,39 @@ def test_cnts1():
     tokens = EN.tokenize(text)
     assert len(tokens) == 8


 def test_cnts2():
     text = u"""U.N. regulations are not a part of their concern."""
     tokens = EN.tokenize(text)
     assert len(tokens) == 10


 def test_cnts3():
     text = u"“Isn't it?”"
     tokens = EN.tokenize(text)
-    assert len(tokens) == 6
+    words = [t.string for t in tokens]
+    assert len(words) == 6


 def test_cnts4():
     text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
     tokens = EN.tokenize(text)
-    assert len(tokens) == 15
+    words = [t.string for t in tokens]
+    assert len(words) == 15


 def test_cnts5():
     text = """'Me too!', Mr. P. Delaware cried. """
     tokens = EN.tokenize(text)
     assert len(tokens) == 11


 def test_cnts6():
     text = u'They ran about 10km.'
     tokens = EN.tokenize(text)
-    assert len(tokens) == 6
+    words = [t.string for t in tokens]
+    assert len(words) == 6


 #def test_cnts7():
 #    text = 'But then the 6,000-year ice age came...'