mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Allow Morphology class to set up tokens
Add the Morphology.assign_untagged() C-method, and call it from Doc.push_back() whenever a token is created. This gives the Morphology class a place to initialize token data.
This commit is contained in:
commit
a6ac4699eb
|
@ -24,6 +24,8 @@ class Lemmatizer(object):
|
|||
univ_pos = 'adj'
|
||||
elif univ_pos == PUNCT:
|
||||
univ_pos = 'punct'
|
||||
else:
|
||||
return set([string.lower()])
|
||||
# See Issue #435 for an example of where this logic is required.
|
||||
if self.is_base_form(univ_pos, morphology):
|
||||
return set([string.lower()])
|
||||
|
|
|
@ -35,6 +35,8 @@ cdef class Morphology:
|
|||
cdef RichTagC* rich_tags
|
||||
cdef PreshMapArray _cache
|
||||
|
||||
cdef int assign_untagged(self, TokenC* token) except -1
|
||||
|
||||
cdef int assign_tag(self, TokenC* token, tag) except -1
|
||||
|
||||
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
|
||||
|
|
|
@ -42,7 +42,7 @@ cdef class Morphology:
|
|||
self.tag_names = tuple(sorted(tag_map.keys()))
|
||||
self.reverse_index = {}
|
||||
|
||||
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
|
||||
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC))
|
||||
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
||||
self.tag_map[tag_str] = dict(attrs)
|
||||
attrs = _normalize_props(attrs)
|
||||
|
@ -52,6 +52,10 @@ cdef class Morphology:
|
|||
self.rich_tags[i].morph = 0
|
||||
self.rich_tags[i].pos = attrs[POS]
|
||||
self.reverse_index[self.rich_tags[i].name] = i
|
||||
# Add a 'null' tag, which we can reference when assigning morphology to
|
||||
# untagged tokens.
|
||||
self.rich_tags[self.n_tags].id = self.n_tags
|
||||
|
||||
self._cache = PreshMapArray(self.n_tags)
|
||||
self.exc = {}
|
||||
if exc is not None:
|
||||
|
@ -62,6 +66,10 @@ cdef class Morphology:
|
|||
return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
|
||||
self.exc), None, None)
|
||||
|
||||
cdef int assign_untagged(self, TokenC* token) except -1:
    '''Initialize morphological data on a token that has no POS tag.

    Only the lemma is set here: it is computed by self.lemmatize() from
    the orth ID of the token's lexeme, with 0 passed in place of a POS
    value and an empty dict as the morphological features.
    '''
    # No tag has been assigned yet, so there are no features to pass on.
    no_features = {}
    token.lemma = self.lemmatize(0, token.lex.orth, no_features)
|
||||
|
||||
cdef int assign_tag(self, TokenC* token, tag) except -1:
|
||||
if isinstance(tag, basestring):
|
||||
tag = self.strings.add(tag)
|
||||
|
@ -72,7 +80,7 @@ cdef class Morphology:
|
|||
token.tag = tag
|
||||
|
||||
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
|
||||
if tag_id >= self.n_tags:
|
||||
if tag_id > self.n_tags:
|
||||
raise ValueError("Unknown tag ID: %s" % tag_id)
|
||||
# TODO: It's pretty arbitrary to put this logic here. I guess the justification
|
||||
# is that this is where the specific word and the tag interact. Still,
|
||||
|
@ -151,8 +159,6 @@ cdef class Morphology:
|
|||
cdef unicode py_string = self.strings[orth]
|
||||
if self.lemmatizer is None:
|
||||
return self.strings.add(py_string.lower())
|
||||
if univ_pos not in (NOUN, VERB, ADJ, PUNCT):
|
||||
return self.strings.add(py_string.lower())
|
||||
cdef set lemma_strings
|
||||
cdef unicode lemma_string
|
||||
lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
|
||||
|
|
37
spacy/tests/doc/test_creation.py
Normal file
37
spacy/tests/doc/test_creation.py
Normal file
|
@ -0,0 +1,37 @@
|
|||
'''Test Doc sets up tokens correctly.'''
|
||||
from __future__ import unicode_literals
|
||||
import pytest
|
||||
|
||||
from ...vocab import Vocab
|
||||
from ...tokens.doc import Doc
|
||||
from ...lemmatizerlookup import Lemmatizer
|
||||
|
||||
|
||||
@pytest.fixture
def lemmatizer():
    '''Provide a lookup Lemmatizer backed by a tiny word -> lemma table.'''
    lookup_table = {'dogs': 'dog', 'boxen': 'box', 'mice': 'mouse'}
    return Lemmatizer(lookup_table)
|
||||
|
||||
|
||||
@pytest.fixture
def vocab(lemmatizer):
    '''Provide a Vocab constructed with the lookup lemmatizer fixture.'''
    vocab_with_lemmatizer = Vocab(lemmatizer=lemmatizer)
    return vocab_with_lemmatizer
|
||||
|
||||
|
||||
def test_empty_doc(vocab):
    '''A Doc created from a vocab alone has length zero.'''
    empty = Doc(vocab)
    assert len(empty) == 0
|
||||
|
||||
|
||||
def test_single_word(vocab):
    '''Check the text of a one-word Doc with and without explicit spaces.'''
    # Without a `spaces` argument the word is followed by a space.
    default_spacing = Doc(vocab, words=['a'])
    assert default_spacing.text == 'a '

    # Passing spaces=[False] drops the trailing space.
    no_trailing_space = Doc(vocab, words=['a'], spaces=[False])
    assert no_trailing_space.text == 'a'
|
||||
|
||||
|
||||
def test_lookup_lemmatization(vocab):
    '''Check lemmas assigned to tokens when a Doc is built from words.

    'dogs' is in the lookup table and maps to 'dog'; 'dogses' is not, and
    is expected to keep its own form as its lemma.
    '''
    doc = Doc(vocab, words=['dogs', 'dogses'])
    expected = [('dogs', 'dog'), ('dogses', 'dogses')]
    for index, (text, lemma) in enumerate(expected):
        token = doc[index]
        assert token.text == text
        assert token.lemma_ == lemma
|
|
@ -512,6 +512,8 @@ cdef class Doc:
|
|||
assert t.lex.orth != 0
|
||||
t.spacy = has_space
|
||||
self.length += 1
|
||||
# Set morphological attributes, e.g. by lemma, if possible
|
||||
self.vocab.morphology.assign_untagged(t)
|
||||
self._py_tokens.append(None)
|
||||
return t.idx + t.lex.length + t.spacy
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user