mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Merge pull request #1442 from explosion/feature/fix-sp
💫Fix SP tag, tweak Vectors.__init__, fix Morphology
This commit is contained in:
commit
ef3e5a361b
|
@ -62,5 +62,5 @@ TAG_MAP = {
|
||||||
"VVIZU": {POS: VERB, "VerbForm": "inf"},
|
"VVIZU": {POS: VERB, "VerbForm": "inf"},
|
||||||
"VVPP": {POS: VERB, "Aspect": "perf", "VerbForm": "part"},
|
"VVPP": {POS: VERB, "Aspect": "perf", "VerbForm": "part"},
|
||||||
"XY": {POS: X},
|
"XY": {POS: X},
|
||||||
"SP": {POS: SPACE}
|
"_SP": {POS: SPACE}
|
||||||
}
|
}
|
||||||
|
|
|
@ -55,11 +55,11 @@ TAG_MAP = {
|
||||||
"WP": {POS: NOUN, "PronType": "int|rel"},
|
"WP": {POS: NOUN, "PronType": "int|rel"},
|
||||||
"WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
|
"WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
|
||||||
"WRB": {POS: ADV, "PronType": "int|rel"},
|
"WRB": {POS: ADV, "PronType": "int|rel"},
|
||||||
"SP": {POS: SPACE},
|
|
||||||
"ADD": {POS: X},
|
"ADD": {POS: X},
|
||||||
"NFP": {POS: PUNCT},
|
"NFP": {POS: PUNCT},
|
||||||
"GW": {POS: X},
|
"GW": {POS: X},
|
||||||
"XX": {POS: X},
|
"XX": {POS: X},
|
||||||
"BES": {POS: VERB},
|
"BES": {POS: VERB},
|
||||||
"HVS": {POS: VERB}
|
"HVS": {POS: VERB},
|
||||||
|
"_SP": {POS: SPACE},
|
||||||
}
|
}
|
||||||
|
|
|
@ -303,5 +303,5 @@ TAG_MAP = {
|
||||||
"VERB__VerbForm=Ger": {"morph": "VerbForm=Ger", "pos": "VERB"},
|
"VERB__VerbForm=Ger": {"morph": "VerbForm=Ger", "pos": "VERB"},
|
||||||
"VERB__VerbForm=Inf": {"morph": "VerbForm=Inf", "pos": "VERB"},
|
"VERB__VerbForm=Inf": {"morph": "VerbForm=Inf", "pos": "VERB"},
|
||||||
"X___": {"morph": "_", "pos": "X"},
|
"X___": {"morph": "_", "pos": "X"},
|
||||||
"SP": {"morph": "_", "pos": "SPACE"},
|
"_SP": {"morph": "_", "pos": "SPACE"},
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,63 +19,64 @@ TAG_MAP = {
|
||||||
"NPRP": {POS: PRON},
|
"NPRP": {POS: PRON},
|
||||||
# ADJ
|
# ADJ
|
||||||
"ADJ": {POS: ADJ},
|
"ADJ": {POS: ADJ},
|
||||||
"NONM": {POS: ADJ},
|
"NONM": {POS: ADJ},
|
||||||
"VATT": {POS: ADJ},
|
"VATT": {POS: ADJ},
|
||||||
"DONM": {POS: ADJ},
|
"DONM": {POS: ADJ},
|
||||||
# ADV
|
# ADV
|
||||||
"ADV": {POS: ADV},
|
"ADV": {POS: ADV},
|
||||||
"ADVN": {POS: ADV},
|
"ADVN": {POS: ADV},
|
||||||
"ADVI": {POS: ADV},
|
"ADVI": {POS: ADV},
|
||||||
"ADVP": {POS: ADV},
|
"ADVP": {POS: ADV},
|
||||||
"ADVS": {POS: ADV},
|
"ADVS": {POS: ADV},
|
||||||
# INT
|
# INT
|
||||||
"INT": {POS: INTJ},
|
"INT": {POS: INTJ},
|
||||||
# PRON
|
# PRON
|
||||||
"PROPN": {POS: PROPN},
|
"PROPN": {POS: PROPN},
|
||||||
"PPRS": {POS: PROPN},
|
"PPRS": {POS: PROPN},
|
||||||
"PDMN": {POS: PROPN},
|
"PDMN": {POS: PROPN},
|
||||||
"PNTR": {POS: PROPN},
|
"PNTR": {POS: PROPN},
|
||||||
# DET
|
# DET
|
||||||
"DET": {POS: DET},
|
"DET": {POS: DET},
|
||||||
"DDAN": {POS: DET},
|
"DDAN": {POS: DET},
|
||||||
"DDAC": {POS: DET},
|
"DDAC": {POS: DET},
|
||||||
"DDBQ": {POS: DET},
|
"DDBQ": {POS: DET},
|
||||||
"DDAQ": {POS: DET},
|
"DDAQ": {POS: DET},
|
||||||
"DIAC": {POS: DET},
|
"DIAC": {POS: DET},
|
||||||
"DIBQ": {POS: DET},
|
"DIBQ": {POS: DET},
|
||||||
"DIAQ": {POS: DET},
|
"DIAQ": {POS: DET},
|
||||||
"DCNM": {POS: DET},
|
"DCNM": {POS: DET},
|
||||||
# NUM
|
# NUM
|
||||||
"NUM": {POS: NUM},
|
"NUM": {POS: NUM},
|
||||||
"NCNM": {POS: NUM},
|
"NCNM": {POS: NUM},
|
||||||
"NLBL": {POS: NUM},
|
"NLBL": {POS: NUM},
|
||||||
"DCNM": {POS: NUM},
|
"DCNM": {POS: NUM},
|
||||||
# AUX
|
# AUX
|
||||||
"AUX": {POS: AUX},
|
"AUX": {POS: AUX},
|
||||||
"XVBM": {POS: AUX},
|
"XVBM": {POS: AUX},
|
||||||
"XVAM": {POS: AUX},
|
"XVAM": {POS: AUX},
|
||||||
"XVMM": {POS: AUX},
|
"XVMM": {POS: AUX},
|
||||||
"XVBB": {POS: AUX},
|
"XVBB": {POS: AUX},
|
||||||
"XVAE": {POS: AUX},
|
"XVAE": {POS: AUX},
|
||||||
# ADP
|
# ADP
|
||||||
"ADP": {POS: ADP},
|
"ADP": {POS: ADP},
|
||||||
"RPRE": {POS: ADP},
|
"RPRE": {POS: ADP},
|
||||||
# CCONJ
|
# CCONJ
|
||||||
"CCONJ": {POS: CCONJ},
|
"CCONJ": {POS: CCONJ},
|
||||||
"JCRG": {POS: CCONJ},
|
"JCRG": {POS: CCONJ},
|
||||||
# SCONJ
|
# SCONJ
|
||||||
"SCONJ": {POS: SCONJ},
|
"SCONJ": {POS: SCONJ},
|
||||||
"PREL": {POS: SCONJ},
|
"PREL": {POS: SCONJ},
|
||||||
"JSBR": {POS: SCONJ},
|
"JSBR": {POS: SCONJ},
|
||||||
"JCMP": {POS: SCONJ},
|
"JCMP": {POS: SCONJ},
|
||||||
# PART
|
# PART
|
||||||
"PART": {POS: PART},
|
"PART": {POS: PART},
|
||||||
"FIXN": {POS: PART},
|
"FIXN": {POS: PART},
|
||||||
"FIXV": {POS: PART},
|
"FIXV": {POS: PART},
|
||||||
"EAFF": {POS: PART},
|
"EAFF": {POS: PART},
|
||||||
"AITT": {POS: PART},
|
"AITT": {POS: PART},
|
||||||
"NEG": {POS: PART},
|
"NEG": {POS: PART},
|
||||||
# PUNCT
|
# PUNCT
|
||||||
"PUNCT": {POS: PUNCT},
|
"PUNCT": {POS: PUNCT},
|
||||||
"PUNC": {POS: PUNCT}
|
"PUNC": {POS: PUNCT},
|
||||||
|
"_SP": {POS: SPACE}
|
||||||
}
|
}
|
||||||
|
|
|
@ -44,7 +44,7 @@ cdef class Morphology:
|
||||||
cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
|
cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
|
||||||
|
|
||||||
|
|
||||||
cpdef enum univ_morph_t:
|
cdef enum univ_morph_t:
|
||||||
NIL = 0
|
NIL = 0
|
||||||
Animacy_anim = symbols.Animacy_anim
|
Animacy_anim = symbols.Animacy_anim
|
||||||
Animacy_inam
|
Animacy_inam
|
||||||
|
|
|
@ -4,7 +4,7 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
from libc.string cimport memset
|
from libc.string cimport memset
|
||||||
|
|
||||||
from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
|
from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT, SPACE
|
||||||
from .attrs cimport POS, IS_SPACE
|
from .attrs cimport POS, IS_SPACE
|
||||||
from .parts_of_speech import IDS as POS_IDS
|
from .parts_of_speech import IDS as POS_IDS
|
||||||
from .lexeme cimport Lexeme
|
from .lexeme cimport Lexeme
|
||||||
|
@ -36,14 +36,22 @@ cdef class Morphology:
|
||||||
def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
|
def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self.strings = string_store
|
self.strings = string_store
|
||||||
|
# Add special space symbol. We prefix with underscore, to make sure it
|
||||||
|
# always sorts to the end.
|
||||||
|
space_attrs = tag_map.pop('SP', {POS: SPACE})
|
||||||
|
if '_SP' not in tag_map:
|
||||||
|
self.strings.add('_SP')
|
||||||
|
tag_map = dict(tag_map)
|
||||||
|
tag_map['_SP'] = space_attrs
|
||||||
|
self.tag_names = tuple(sorted(tag_map.keys()))
|
||||||
self.tag_map = {}
|
self.tag_map = {}
|
||||||
self.lemmatizer = lemmatizer
|
self.lemmatizer = lemmatizer
|
||||||
self.n_tags = len(tag_map)
|
self.n_tags = len(tag_map)
|
||||||
self.tag_names = tuple(sorted(tag_map.keys()))
|
|
||||||
self.reverse_index = {}
|
self.reverse_index = {}
|
||||||
|
|
||||||
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC))
|
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC))
|
||||||
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
||||||
|
self.strings.add(tag_str)
|
||||||
self.tag_map[tag_str] = dict(attrs)
|
self.tag_map[tag_str] = dict(attrs)
|
||||||
attrs = _normalize_props(attrs)
|
attrs = _normalize_props(attrs)
|
||||||
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
||||||
|
@ -93,7 +101,7 @@ cdef class Morphology:
|
||||||
# the statistical model fails.
|
# the statistical model fails.
|
||||||
# Related to Issue #220
|
# Related to Issue #220
|
||||||
if Lexeme.c_check_flag(token.lex, IS_SPACE):
|
if Lexeme.c_check_flag(token.lex, IS_SPACE):
|
||||||
tag_id = self.reverse_index[self.strings.add('SP')]
|
tag_id = self.reverse_index[self.strings.add('_SP')]
|
||||||
rich_tag = self.rich_tags[tag_id]
|
rich_tag = self.rich_tags[tag_id]
|
||||||
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
|
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
|
||||||
if analysis is NULL:
|
if analysis is NULL:
|
||||||
|
@ -426,3 +434,7 @@ IDS = {
|
||||||
|
|
||||||
|
|
||||||
NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
|
NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
|
||||||
|
# Unfortunate hack here, to work around problem with long cpdef enum
|
||||||
|
# (which is generating an enormous amount of C++ in Cython 0.24+)
|
||||||
|
# We keep the enum cdef, and just make sure the names are available to Python
|
||||||
|
locals().update(IDS)
|
||||||
|
|
|
@ -35,18 +35,18 @@ def vocab(en_vocab, vectors):
|
||||||
|
|
||||||
|
|
||||||
def test_init_vectors_with_data(strings, data):
|
def test_init_vectors_with_data(strings, data):
|
||||||
v = Vectors(strings, data)
|
v = Vectors(strings, data=data)
|
||||||
assert v.shape == data.shape
|
assert v.shape == data.shape
|
||||||
|
|
||||||
def test_init_vectors_with_width(strings):
|
def test_init_vectors_with_width(strings):
|
||||||
v = Vectors(strings, 3)
|
v = Vectors(strings, width=3)
|
||||||
for string in strings:
|
for string in strings:
|
||||||
v.add(string)
|
v.add(string)
|
||||||
assert v.shape == (len(strings), 3)
|
assert v.shape == (len(strings), 3)
|
||||||
|
|
||||||
|
|
||||||
def test_get_vector(strings, data):
|
def test_get_vector(strings, data):
|
||||||
v = Vectors(strings, data)
|
v = Vectors(strings, data=data)
|
||||||
for string in strings:
|
for string in strings:
|
||||||
v.add(string)
|
v.add(string)
|
||||||
assert list(v[strings[0]]) == list(data[0])
|
assert list(v[strings[0]]) == list(data[0])
|
||||||
|
@ -56,7 +56,7 @@ def test_get_vector(strings, data):
|
||||||
|
|
||||||
def test_set_vector(strings, data):
|
def test_set_vector(strings, data):
|
||||||
orig = data.copy()
|
orig = data.copy()
|
||||||
v = Vectors(strings, data)
|
v = Vectors(strings, data=data)
|
||||||
for string in strings:
|
for string in strings:
|
||||||
v.add(string)
|
v.add(string)
|
||||||
assert list(v[strings[0]]) == list(orig[0])
|
assert list(v[strings[0]]) == list(orig[0])
|
||||||
|
|
|
@ -32,22 +32,24 @@ cdef class Vectors:
|
||||||
cdef public object keys
|
cdef public object keys
|
||||||
cdef public int i
|
cdef public int i
|
||||||
|
|
||||||
def __init__(self, strings, data_or_width=0):
|
def __init__(self, strings, width=0, data=None):
|
||||||
if isinstance(strings, StringStore):
|
if isinstance(strings, StringStore):
|
||||||
self.strings = strings
|
self.strings = strings
|
||||||
else:
|
else:
|
||||||
self.strings = StringStore()
|
self.strings = StringStore()
|
||||||
for string in strings:
|
for string in strings:
|
||||||
self.strings.add(string)
|
self.strings.add(string)
|
||||||
if isinstance(data_or_width, int):
|
if data is not None:
|
||||||
self.data = data = numpy.zeros((len(strings), data_or_width),
|
self.data = numpy.asarray(data, dtype='f')
|
||||||
dtype='f')
|
|
||||||
else:
|
else:
|
||||||
data = data_or_width
|
self.data = numpy.zeros((len(self.strings), width), dtype='f')
|
||||||
self.i = 0
|
self.i = 0
|
||||||
self.data = data
|
|
||||||
self.key2row = {}
|
self.key2row = {}
|
||||||
self.keys = np.ndarray((self.data.shape[0],), dtype='uint64')
|
self.keys = numpy.zeros((self.data.shape[0],), dtype='uint64')
|
||||||
|
for i, string in enumerate(self.strings):
|
||||||
|
if i >= self.data.shape[0]:
|
||||||
|
break
|
||||||
|
self.add(self.strings[string], self.data[i])
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
return (Vectors, (self.strings, self.data))
|
return (Vectors, (self.strings, self.data))
|
||||||
|
|
|
@ -62,12 +62,9 @@ cdef class Vocab:
|
||||||
if strings:
|
if strings:
|
||||||
for string in strings:
|
for string in strings:
|
||||||
_ = self[string]
|
_ = self[string]
|
||||||
for name in tag_map.keys():
|
|
||||||
if name:
|
|
||||||
self.strings.add(name)
|
|
||||||
self.lex_attr_getters = lex_attr_getters
|
self.lex_attr_getters = lex_attr_getters
|
||||||
self.morphology = Morphology(self.strings, tag_map, lemmatizer)
|
self.morphology = Morphology(self.strings, tag_map, lemmatizer)
|
||||||
self.vectors = Vectors(self.strings)
|
self.vectors = Vectors(self.strings, width=0)
|
||||||
|
|
||||||
property lang:
|
property lang:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
@ -255,7 +252,7 @@ cdef class Vocab:
|
||||||
"""
|
"""
|
||||||
if new_dim is None:
|
if new_dim is None:
|
||||||
new_dim = self.vectors.data.shape[1]
|
new_dim = self.vectors.data.shape[1]
|
||||||
self.vectors = Vectors(self.strings, new_dim)
|
self.vectors = Vectors(self.strings, width=new_dim)
|
||||||
|
|
||||||
def get_vector(self, orth):
|
def get_vector(self, orth):
|
||||||
"""Retrieve a vector for a word in the vocabulary.
|
"""Retrieve a vector for a word in the vocabulary.
|
||||||
|
@ -338,7 +335,7 @@ cdef class Vocab:
|
||||||
if self.vectors is None:
|
if self.vectors is None:
|
||||||
return None
|
return None
|
||||||
else:
|
else:
|
||||||
return self.vectors.to_bytes(exclude='strings.json')
|
return self.vectors.to_bytes()
|
||||||
|
|
||||||
getters = OrderedDict((
|
getters = OrderedDict((
|
||||||
('strings', lambda: self.strings.to_bytes()),
|
('strings', lambda: self.strings.to_bytes()),
|
||||||
|
@ -358,7 +355,7 @@ cdef class Vocab:
|
||||||
if self.vectors is None:
|
if self.vectors is None:
|
||||||
return None
|
return None
|
||||||
else:
|
else:
|
||||||
return self.vectors.from_bytes(b, exclude='strings')
|
return self.vectors.from_bytes(b)
|
||||||
setters = OrderedDict((
|
setters = OrderedDict((
|
||||||
('strings', lambda b: self.strings.from_bytes(b)),
|
('strings', lambda b: self.strings.from_bytes(b)),
|
||||||
('lexemes', lambda b: self.lexemes_from_bytes(b)),
|
('lexemes', lambda b: self.lexemes_from_bytes(b)),
|
||||||
|
|
|
@ -12,7 +12,7 @@ p
|
||||||
|
|
||||||
p
|
p
|
||||||
| Create a new vector store. To keep the vector table empty, pass
|
| Create a new vector store. To keep the vector table empty, pass
|
||||||
| #[code data_or_width=0]. You can also create the vector table and add
|
| #[code width=0]. You can also create the vector table and add
|
||||||
| vectors one by one, or set the vector values directly on initialisation.
|
| vectors one by one, or set the vector values directly on initialisation.
|
||||||
|
|
||||||
+aside-code("Example").
|
+aside-code("Example").
|
||||||
|
@ -21,11 +21,11 @@ p
|
||||||
|
|
||||||
empty_vectors = Vectors(StringStore())
|
empty_vectors = Vectors(StringStore())
|
||||||
|
|
||||||
vectors = Vectors([u'cat'], 300)
|
vectors = Vectors([u'cat'], width=300)
|
||||||
vectors[u'cat'] = numpy.random.uniform(-1, 1, (300,))
|
vectors[u'cat'] = numpy.random.uniform(-1, 1, (300,))
|
||||||
|
|
||||||
vector_table = numpy.zeros((3, 300), dtype='f')
|
vector_table = numpy.zeros((3, 300), dtype='f')
|
||||||
vectors = Vectors(StringStore(), vector_table)
|
vectors = Vectors(StringStore(), data=vector_table)
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
|
@ -36,9 +36,12 @@ p
|
||||||
| that maps strings to hash values, and vice versa.
|
| that maps strings to hash values, and vice versa.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code data_or_width]
|
+cell #[code data]
|
||||||
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] or int
|
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
|
||||||
+cell Vector data or number of dimensions.
|
|
||||||
|
+row
|
||||||
|
+cell #[code width]
|
||||||
|
+cell Number of dimensions.
|
||||||
|
|
||||||
+row("foot")
|
+row("foot")
|
||||||
+cell returns
|
+cell returns
|
||||||
|
|
Loading…
Reference in New Issue
Block a user