mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
* Rename sic to orth
This commit is contained in:
parent
93d4bd6c2e
commit
5ed8b2b98f
|
@ -19,7 +19,7 @@ def get_lex_props(string):
|
||||||
return {
|
return {
|
||||||
'flags': get_flags(string),
|
'flags': get_flags(string),
|
||||||
'length': len(string),
|
'length': len(string),
|
||||||
'sic': string,
|
'orth': string,
|
||||||
'norm1': string.lower(),
|
'norm1': string.lower(),
|
||||||
'norm2': string,
|
'norm2': string,
|
||||||
'shape': orth.word_shape(string),
|
'shape': orth.word_shape(string),
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
from ..attrs cimport FLAG0, FLAG1, FLAG2, FLAG3, FLAG4, FLAG5, FLAG6, FLAG7
|
from ..attrs cimport FLAG0, FLAG1, FLAG2, FLAG3, FLAG4, FLAG5, FLAG6, FLAG7
|
||||||
from ..attrs cimport FLAG8, FLAG9, FLAG10
|
from ..attrs cimport FLAG8, FLAG9, FLAG10
|
||||||
from ..attrs cimport SIC as _SIC
|
from ..attrs cimport ORTH as _ORTH
|
||||||
from ..attrs cimport SHAPE as _SHAPE
|
from ..attrs cimport SHAPE as _SHAPE
|
||||||
from ..attrs cimport NORM1 as _NORM1
|
from ..attrs cimport NORM1 as _NORM1
|
||||||
from ..attrs cimport NORM2 as _NORM2
|
from ..attrs cimport NORM2 as _NORM2
|
||||||
|
@ -24,7 +24,7 @@ cpdef enum:
|
||||||
LIKE_NUM = FLAG9
|
LIKE_NUM = FLAG9
|
||||||
IS_STOP = FLAG10
|
IS_STOP = FLAG10
|
||||||
|
|
||||||
SIC = _SIC
|
ORTH = _ORTH
|
||||||
SHAPE = _SHAPE
|
SHAPE = _SHAPE
|
||||||
LOWER = _NORM1
|
LOWER = _NORM1
|
||||||
NORM2 = _NORM2
|
NORM2 = _NORM2
|
||||||
|
|
|
@ -70,7 +70,7 @@ cpdef enum misc_t:
|
||||||
|
|
||||||
|
|
||||||
cpdef enum:
|
cpdef enum:
|
||||||
P2_sic
|
P2_orth
|
||||||
P2_cluster
|
P2_cluster
|
||||||
P2_shape
|
P2_shape
|
||||||
P2_prefix
|
P2_prefix
|
||||||
|
@ -78,7 +78,7 @@ cpdef enum:
|
||||||
P2_pos
|
P2_pos
|
||||||
P2_lemma
|
P2_lemma
|
||||||
|
|
||||||
P1_sic
|
P1_orth
|
||||||
P1_cluster
|
P1_cluster
|
||||||
P1_shape
|
P1_shape
|
||||||
P1_prefix
|
P1_prefix
|
||||||
|
@ -86,7 +86,7 @@ cpdef enum:
|
||||||
P1_pos
|
P1_pos
|
||||||
P1_lemma
|
P1_lemma
|
||||||
|
|
||||||
W_sic
|
W_orth
|
||||||
W_cluster
|
W_cluster
|
||||||
W_shape
|
W_shape
|
||||||
W_prefix
|
W_prefix
|
||||||
|
@ -94,7 +94,7 @@ cpdef enum:
|
||||||
W_pos
|
W_pos
|
||||||
W_lemma
|
W_lemma
|
||||||
|
|
||||||
N1_sic
|
N1_orth
|
||||||
N1_cluster
|
N1_cluster
|
||||||
N1_shape
|
N1_shape
|
||||||
N1_prefix
|
N1_prefix
|
||||||
|
@ -102,7 +102,7 @@ cpdef enum:
|
||||||
N1_pos
|
N1_pos
|
||||||
N1_lemma
|
N1_lemma
|
||||||
|
|
||||||
N2_sic
|
N2_orth
|
||||||
N2_cluster
|
N2_cluster
|
||||||
N2_shape
|
N2_shape
|
||||||
N2_prefix
|
N2_prefix
|
||||||
|
@ -169,11 +169,11 @@ POS_TAGS = {
|
||||||
|
|
||||||
|
|
||||||
POS_TEMPLATES = (
|
POS_TEMPLATES = (
|
||||||
(W_sic,),
|
(W_orth,),
|
||||||
(P1_lemma, P1_pos),
|
(P1_lemma, P1_pos),
|
||||||
(P2_lemma, P2_pos),
|
(P2_lemma, P2_pos),
|
||||||
(N1_sic,),
|
(N1_orth,),
|
||||||
(N2_sic,),
|
(N2_orth,),
|
||||||
|
|
||||||
(W_suffix,),
|
(W_suffix,),
|
||||||
(W_prefix,),
|
(W_prefix,),
|
||||||
|
@ -181,7 +181,7 @@ POS_TEMPLATES = (
|
||||||
(P1_pos,),
|
(P1_pos,),
|
||||||
(P2_pos,),
|
(P2_pos,),
|
||||||
(P1_pos, P2_pos),
|
(P1_pos, P2_pos),
|
||||||
(P1_pos, W_sic),
|
(P1_pos, W_orth),
|
||||||
(P1_suffix,),
|
(P1_suffix,),
|
||||||
(N1_suffix,),
|
(N1_suffix,),
|
||||||
|
|
||||||
|
@ -272,21 +272,21 @@ cdef class EnPosTagger:
|
||||||
cdef int set_morph(self, const int i, TokenC* tokens) except -1:
|
cdef int set_morph(self, const int i, TokenC* tokens) except -1:
|
||||||
cdef const PosTag* tag = &self.tags[tokens[i].tag]
|
cdef const PosTag* tag = &self.tags[tokens[i].tag]
|
||||||
tokens[i].pos = tag.pos
|
tokens[i].pos = tag.pos
|
||||||
cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.sic)
|
cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth)
|
||||||
if cached is NULL:
|
if cached is NULL:
|
||||||
cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
|
cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
|
||||||
cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
|
cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
|
||||||
cached.morph = tag.morph
|
cached.morph = tag.morph
|
||||||
self._morph_cache.set(tag.id, tokens[i].lex.sic, <void*>cached)
|
self._morph_cache.set(tag.id, tokens[i].lex.orth, <void*>cached)
|
||||||
tokens[i].lemma = cached.lemma
|
tokens[i].lemma = cached.lemma
|
||||||
tokens[i].morph = cached.morph
|
tokens[i].morph = cached.morph
|
||||||
|
|
||||||
cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1:
|
cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1:
|
||||||
if self.lemmatizer is None:
|
if self.lemmatizer is None:
|
||||||
return lex.sic
|
return lex.orth
|
||||||
cdef unicode py_string = self.strings[lex.sic]
|
cdef unicode py_string = self.strings[lex.orth]
|
||||||
if pos != NOUN and pos != VERB and pos != ADJ:
|
if pos != NOUN and pos != VERB and pos != ADJ:
|
||||||
return lex.sic
|
return lex.orth
|
||||||
cdef set lemma_strings
|
cdef set lemma_strings
|
||||||
cdef unicode lemma_string
|
cdef unicode lemma_string
|
||||||
lemma_strings = self.lemmatizer(py_string, pos)
|
lemma_strings = self.lemmatizer(py_string, pos)
|
||||||
|
@ -301,29 +301,29 @@ cdef class EnPosTagger:
|
||||||
cdef dict entries
|
cdef dict entries
|
||||||
cdef dict props
|
cdef dict props
|
||||||
cdef int lemma
|
cdef int lemma
|
||||||
cdef id_t sic
|
cdef id_t orth
|
||||||
cdef int pos
|
cdef int pos
|
||||||
for pos_str, entries in exc.items():
|
for pos_str, entries in exc.items():
|
||||||
pos = self.tag_names.index(pos_str)
|
pos = self.tag_names.index(pos_str)
|
||||||
for form_str, props in entries.items():
|
for form_str, props in entries.items():
|
||||||
lemma_str = props.get('L', form_str)
|
lemma_str = props.get('L', form_str)
|
||||||
sic = self.strings[form_str]
|
orth = self.strings[form_str]
|
||||||
cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
|
cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
|
||||||
cached.lemma = self.strings[lemma_str]
|
cached.lemma = self.strings[lemma_str]
|
||||||
set_morph_from_dict(&cached.morph, props)
|
set_morph_from_dict(&cached.morph, props)
|
||||||
self._morph_cache.set(pos, sic, <void*>cached)
|
self._morph_cache.set(pos, orth, <void*>cached)
|
||||||
|
|
||||||
|
|
||||||
cdef int fill_context(atom_t* context, const int i, const TokenC* tokens) except -1:
|
cdef int fill_context(atom_t* context, const int i, const TokenC* tokens) except -1:
|
||||||
_fill_from_token(&context[P2_sic], &tokens[i-2])
|
_fill_from_token(&context[P2_orth], &tokens[i-2])
|
||||||
_fill_from_token(&context[P1_sic], &tokens[i-1])
|
_fill_from_token(&context[P1_orth], &tokens[i-1])
|
||||||
_fill_from_token(&context[W_sic], &tokens[i])
|
_fill_from_token(&context[W_orth], &tokens[i])
|
||||||
_fill_from_token(&context[N1_sic], &tokens[i+1])
|
_fill_from_token(&context[N1_orth], &tokens[i+1])
|
||||||
_fill_from_token(&context[N2_sic], &tokens[i+2])
|
_fill_from_token(&context[N2_orth], &tokens[i+2])
|
||||||
|
|
||||||
|
|
||||||
cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
|
cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
|
||||||
context[0] = t.lex.sic
|
context[0] = t.lex.orth
|
||||||
context[1] = t.lex.cluster
|
context[1] = t.lex.cluster
|
||||||
context[2] = t.lex.shape
|
context[2] = t.lex.shape
|
||||||
context[3] = t.lex.prefix
|
context[3] = t.lex.prefix
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t
|
from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t
|
||||||
from .typedefs cimport ID, SIC, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
from .typedefs cimport ID, ORTH, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||||
from .structs cimport LexemeC
|
from .structs cimport LexemeC
|
||||||
from .strings cimport StringStore
|
from .strings cimport StringStore
|
||||||
|
|
||||||
|
@ -14,20 +14,20 @@ cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings,
|
||||||
const float* empty_vec) except -1
|
const float* empty_vec) except -1
|
||||||
|
|
||||||
cdef class Lexeme:
|
cdef class Lexeme:
|
||||||
cdef readonly ndarray vec
|
cdef readonly ndarray repvec
|
||||||
|
|
||||||
cdef readonly flags_t flags
|
cdef readonly flags_t flags
|
||||||
cdef readonly attr_t id
|
cdef readonly attr_t id
|
||||||
cdef readonly attr_t length
|
cdef readonly attr_t length
|
||||||
|
|
||||||
cdef readonly attr_t sic
|
cdef readonly attr_t orth
|
||||||
cdef readonly attr_t norm1
|
cdef readonly attr_t norm1
|
||||||
cdef readonly attr_t norm2
|
cdef readonly attr_t norm2
|
||||||
cdef readonly attr_t shape
|
cdef readonly attr_t shape
|
||||||
cdef readonly attr_t prefix
|
cdef readonly attr_t prefix
|
||||||
cdef readonly attr_t suffix
|
cdef readonly attr_t suffix
|
||||||
|
|
||||||
cdef readonly unicode sic_
|
cdef readonly unicode orth_
|
||||||
cdef readonly unicode norm1_
|
cdef readonly unicode norm1_
|
||||||
cdef readonly unicode norm2_
|
cdef readonly unicode norm2_
|
||||||
cdef readonly unicode shape_
|
cdef readonly unicode shape_
|
||||||
|
@ -49,14 +49,14 @@ cdef class Lexeme:
|
||||||
py.id = ptr.id
|
py.id = ptr.id
|
||||||
py.length = ptr.length
|
py.length = ptr.length
|
||||||
|
|
||||||
py.sic = ptr.sic
|
py.orth = ptr.orth
|
||||||
py.norm1 = ptr.norm1
|
py.norm1 = ptr.norm1
|
||||||
py.norm2 = ptr.norm2
|
py.norm2 = ptr.norm2
|
||||||
py.shape = ptr.shape
|
py.shape = ptr.shape
|
||||||
py.prefix = ptr.prefix
|
py.prefix = ptr.prefix
|
||||||
py.suffix = ptr.suffix
|
py.suffix = ptr.suffix
|
||||||
|
|
||||||
py.sic_ = strings[ptr.sic]
|
py.orth_ = strings[ptr.orth]
|
||||||
py.norm1_ = strings[ptr.norm1]
|
py.norm1_ = strings[ptr.norm1]
|
||||||
py.norm2_ = strings[ptr.norm2]
|
py.norm2_ = strings[ptr.norm2]
|
||||||
py.shape_ = strings[ptr.shape]
|
py.shape_ = strings[ptr.shape]
|
||||||
|
@ -78,8 +78,8 @@ cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
|
||||||
return check_flag(lex, feat_name)
|
return check_flag(lex, feat_name)
|
||||||
elif feat_name == ID:
|
elif feat_name == ID:
|
||||||
return lex.id
|
return lex.id
|
||||||
elif feat_name == SIC:
|
elif feat_name == ORTH:
|
||||||
return lex.sic
|
return lex.orth
|
||||||
elif feat_name == NORM1:
|
elif feat_name == NORM1:
|
||||||
return lex.norm1
|
return lex.norm1
|
||||||
elif feat_name == NORM2:
|
elif feat_name == NORM2:
|
||||||
|
|
|
@ -16,7 +16,7 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
|
||||||
cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store,
|
cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store,
|
||||||
const float* empty_vec) except -1:
|
const float* empty_vec) except -1:
|
||||||
lex.length = props['length']
|
lex.length = props['length']
|
||||||
lex.sic = string_store[props['sic']]
|
lex.orth = string_store[props['orth']]
|
||||||
lex.norm1 = string_store[props['norm1']]
|
lex.norm1 = string_store[props['norm1']]
|
||||||
lex.norm2 = string_store[props['norm2']]
|
lex.norm2 = string_store[props['norm2']]
|
||||||
lex.shape = string_store[props['shape']]
|
lex.shape = string_store[props['shape']]
|
||||||
|
@ -34,4 +34,4 @@ cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store
|
||||||
cdef class Lexeme:
|
cdef class Lexeme:
|
||||||
"""A dummy docstring"""
|
"""A dummy docstring"""
|
||||||
def __cinit__(self, int vec_size):
|
def __cinit__(self, int vec_size):
|
||||||
self.vec = numpy.ndarray(shape=(vec_size,), dtype=numpy.float32)
|
self.repvec = numpy.ndarray(shape=(vec_size,), dtype=numpy.float32)
|
||||||
|
|
|
@ -11,7 +11,7 @@ cdef struct LexemeC:
|
||||||
attr_t id
|
attr_t id
|
||||||
attr_t length
|
attr_t length
|
||||||
|
|
||||||
attr_t sic
|
attr_t orth
|
||||||
attr_t norm1
|
attr_t norm1
|
||||||
attr_t norm2
|
attr_t norm2
|
||||||
attr_t shape
|
attr_t shape
|
||||||
|
|
|
@ -26,7 +26,7 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
|
||||||
context[5] = 0
|
context[5] = 0
|
||||||
context[6] = 0
|
context[6] = 0
|
||||||
else:
|
else:
|
||||||
context[0] = token.lex.sic
|
context[0] = token.lex.orth
|
||||||
context[1] = token.lemma
|
context[1] = token.lemma
|
||||||
context[2] = token.tag
|
context[2] = token.tag
|
||||||
context[3] = token.lex.cluster
|
context[3] = token.lex.cluster
|
||||||
|
|
|
@ -50,7 +50,7 @@ cdef class Token:
|
||||||
cdef readonly attr_t idx
|
cdef readonly attr_t idx
|
||||||
cdef readonly attr_t cluster
|
cdef readonly attr_t cluster
|
||||||
cdef readonly attr_t length
|
cdef readonly attr_t length
|
||||||
cdef readonly attr_t sic
|
cdef readonly attr_t orth
|
||||||
cdef readonly attr_t norm1
|
cdef readonly attr_t norm1
|
||||||
cdef readonly attr_t norm2
|
cdef readonly attr_t norm2
|
||||||
cdef readonly attr_t shape
|
cdef readonly attr_t shape
|
||||||
|
|
|
@ -7,7 +7,7 @@ from preshed.counter cimport PreshCounter
|
||||||
from .vocab cimport EMPTY_LEXEME
|
from .vocab cimport EMPTY_LEXEME
|
||||||
from .typedefs cimport attr_id_t, attr_t
|
from .typedefs cimport attr_id_t, attr_t
|
||||||
from .typedefs cimport LEMMA
|
from .typedefs cimport LEMMA
|
||||||
from .typedefs cimport ID, SIC, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
from .typedefs cimport ID, ORTH, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||||
from .typedefs cimport POS, LEMMA
|
from .typedefs cimport POS, LEMMA
|
||||||
|
|
||||||
from unidecode import unidecode
|
from unidecode import unidecode
|
||||||
|
@ -42,8 +42,8 @@ cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
|
||||||
return check_flag(lex, feat_name)
|
return check_flag(lex, feat_name)
|
||||||
elif feat_name == ID:
|
elif feat_name == ID:
|
||||||
return lex.id
|
return lex.id
|
||||||
elif feat_name == SIC:
|
elif feat_name == ORTH:
|
||||||
return lex.sic
|
return lex.orth
|
||||||
elif feat_name == NORM1:
|
elif feat_name == NORM1:
|
||||||
return lex.norm1
|
return lex.norm1
|
||||||
elif feat_name == NORM2:
|
elif feat_name == NORM2:
|
||||||
|
@ -97,8 +97,8 @@ cdef class Tokens:
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
if start is None:
|
if start is None:
|
||||||
start = i
|
start = i
|
||||||
if self.data[i].lex.sic == period or self.data[i].lex.sic == exclamation or \
|
if self.data[i].lex.orth == period or self.data[i].lex.orth == exclamation or \
|
||||||
self.data[i].lex.sic == question:
|
self.data[i].lex.orth == question:
|
||||||
spans.append((start, i+1))
|
spans.append((start, i+1))
|
||||||
start = None
|
start = None
|
||||||
if start is not None:
|
if start is not None:
|
||||||
|
@ -176,9 +176,9 @@ cdef class Tokens:
|
||||||
>>> from spacy.en import English, attrs
|
>>> from spacy.en import English, attrs
|
||||||
>>> nlp = English()
|
>>> nlp = English()
|
||||||
>>> tokens = nlp(u'apple apple orange banana')
|
>>> tokens = nlp(u'apple apple orange banana')
|
||||||
>>> tokens.count_by(attrs.SIC)
|
>>> tokens.count_by(attrs.ORTH)
|
||||||
{12800L: 1, 11880L: 2, 7561L: 1}
|
{12800L: 1, 11880L: 2, 7561L: 1}
|
||||||
>>> tokens.to_array([attrs.SIC])
|
>>> tokens.to_array([attrs.ORTH])
|
||||||
array([[11880],
|
array([[11880],
|
||||||
[11880],
|
[11880],
|
||||||
[ 7561],
|
[ 7561],
|
||||||
|
@ -222,7 +222,7 @@ cdef class Token:
|
||||||
self.idx = t.idx
|
self.idx = t.idx
|
||||||
self.cluster = t.lex.cluster
|
self.cluster = t.lex.cluster
|
||||||
self.length = t.lex.length
|
self.length = t.lex.length
|
||||||
self.sic = t.lex.sic
|
self.orth = t.lex.orth
|
||||||
self.norm1 = t.lex.norm1
|
self.norm1 = t.lex.norm1
|
||||||
self.norm2 = t.lex.norm2
|
self.norm2 = t.lex.norm2
|
||||||
self.shape = t.lex.shape
|
self.shape = t.lex.shape
|
||||||
|
@ -270,14 +270,14 @@ cdef class Token:
|
||||||
"""The unicode string of the word, with no whitespace padding."""
|
"""The unicode string of the word, with no whitespace padding."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
cdef const TokenC* t = &self._seq.data[self.i]
|
cdef const TokenC* t = &self._seq.data[self.i]
|
||||||
if t.lex.sic == 0:
|
if t.lex.orth == 0:
|
||||||
return ''
|
return ''
|
||||||
cdef unicode py_ustr = self._seq.vocab.strings[t.lex.sic]
|
cdef unicode py_ustr = self._seq.vocab.strings[t.lex.orth]
|
||||||
return py_ustr
|
return py_ustr
|
||||||
|
|
||||||
property sic_:
|
property orth_:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self._seq.vocab.strings[self.sic]
|
return self._seq.vocab.strings[self.orth]
|
||||||
|
|
||||||
property norm1_:
|
property norm1_:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
|
|
@ -89,7 +89,7 @@ cpdef enum attr_id_t:
|
||||||
FLAG63
|
FLAG63
|
||||||
|
|
||||||
ID
|
ID
|
||||||
SIC
|
ORTH
|
||||||
NORM1
|
NORM1
|
||||||
NORM2
|
NORM2
|
||||||
SHAPE
|
SHAPE
|
||||||
|
|
|
@ -10,8 +10,8 @@ def EN():
|
||||||
|
|
||||||
def test_possess(EN):
|
def test_possess(EN):
|
||||||
tokens = EN("Mike's")
|
tokens = EN("Mike's")
|
||||||
assert EN.vocab.strings[tokens[0].sic] == b"Mike"
|
assert EN.vocab.strings[tokens[0].orth] == "Mike"
|
||||||
assert EN.vocab.strings[tokens[1].sic] == b"'s"
|
assert EN.vocab.strings[tokens[1].orth] == "'s"
|
||||||
assert len(tokens) == 2
|
assert len(tokens) == 2
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -33,17 +33,17 @@ def test_punct(EN):
|
||||||
def test_digits(EN):
|
def test_digits(EN):
|
||||||
tokens = EN('The year: 1984.')
|
tokens = EN('The year: 1984.')
|
||||||
assert len(tokens) == 5
|
assert len(tokens) == 5
|
||||||
assert tokens[0].sic == EN.vocab['The'].sic
|
assert tokens[0].orth == EN.vocab['The'].orth
|
||||||
assert tokens[3].sic == EN.vocab['1984'].sic
|
assert tokens[3].orth == EN.vocab['1984'].orth
|
||||||
|
|
||||||
|
|
||||||
def test_contraction(EN):
|
def test_contraction(EN):
|
||||||
tokens = EN("don't giggle")
|
tokens = EN("don't giggle")
|
||||||
assert len(tokens) == 3
|
assert len(tokens) == 3
|
||||||
assert tokens[1].sic == EN.vocab["n't"].sic
|
assert tokens[1].orth == EN.vocab["n't"].orth
|
||||||
tokens = EN("i said don't!")
|
tokens = EN("i said don't!")
|
||||||
assert len(tokens) == 5
|
assert len(tokens) == 5
|
||||||
assert tokens[4].sic == EN.vocab['!'].sic
|
assert tokens[4].orth == EN.vocab['!'].orth
|
||||||
|
|
||||||
|
|
||||||
def test_contraction_punct(EN):
|
def test_contraction_punct(EN):
|
||||||
|
|
|
@ -11,24 +11,24 @@ def EN():
|
||||||
|
|
||||||
def test_neq(EN):
|
def test_neq(EN):
|
||||||
addr = EN.vocab['Hello']
|
addr = EN.vocab['Hello']
|
||||||
assert EN.vocab['bye'].sic != addr.sic
|
assert EN.vocab['bye'].orth != addr.orth
|
||||||
|
|
||||||
|
|
||||||
def test_eq(EN):
|
def test_eq(EN):
|
||||||
addr = EN.vocab['Hello']
|
addr = EN.vocab['Hello']
|
||||||
assert EN.vocab['Hello'].sic == addr.sic
|
assert EN.vocab['Hello'].orth == addr.orth
|
||||||
|
|
||||||
|
|
||||||
def test_case_neq(EN):
|
def test_case_neq(EN):
|
||||||
addr = EN.vocab['Hello']
|
addr = EN.vocab['Hello']
|
||||||
assert EN.vocab['hello'].sic != addr.sic
|
assert EN.vocab['hello'].orth != addr.orth
|
||||||
|
|
||||||
|
|
||||||
def test_punct_neq(EN):
|
def test_punct_neq(EN):
|
||||||
addr = EN.vocab['Hello']
|
addr = EN.vocab['Hello']
|
||||||
assert EN.vocab['Hello,'].sic != addr.sic
|
assert EN.vocab['Hello,'].orth != addr.orth
|
||||||
|
|
||||||
|
|
||||||
def test_shape_attr(EN):
|
def test_shape_attr(EN):
|
||||||
example = EN.vocab['example']
|
example = EN.vocab['example']
|
||||||
assert example.sic != example.shape
|
assert example.orth != example.shape
|
||||||
|
|
Loading…
Reference in New Issue
Block a user