# coding: utf-8
from __future__ import unicode_literals

import pytest

from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.lemmatizer import Lemmatizer
|
Bloom-filter backed Lookup Tables (#4268)
* Improve load_language_data helper
* WIP: Add Lookups implementation
* Start moving lemma data over to JSON
* WIP: move data over for more languages
* Convert more languages
* Fix lemmatizer fixtures in tests
* Finish conversion
* Auto-format JSON files
* Fix test for now
* Make sure tables are stored on instance
* Update docstrings
* Update docstrings and errors
* Update test
* Add Lookups.__len__
* Add serialization methods
* Add Lookups.remove_table
* Use msgpack for serialization to disk
* Fix file exists check
* Try using OrderedDict for everything
* Update .flake8 [ci skip]
* Try fixing serialization
* Update test_lookups.py
* Update test_serialize_vocab_strings.py
* Lookups / Tables now work
This implements the stubs in the Lookups/Table classes. Currently this
is in Cython but with no type declarations, so that could be improved.
* Add lookups to setup.py
* Actually add lookups pyx
The previous commit added the old py file...
* Lookups work-in-progress
* Move from pyx back to py
* Add string based lookups, fix serialization
* Update tests, language/lemmatizer to work with string lookups
There are some outstanding issues here:
- a pickling-related test fails due to the bloom filter
- some custom lemmatizers (fr/nl at least) have issues
More generally, there's a question of how to deal with the case where
you have a string but want to use the lookup table. Currently the table
allows access by string or id, but that's getting pretty awkward.
* Change lemmatizer lookup method to pass (orth, string)
* Fix token lookup
* Fix French lookup
* Fix lt lemmatizer test
* Fix Dutch lemmatizer
* Fix lemmatizer lookup test
This was using a normal dict instead of a Table, so checks for the
string instead of an integer key failed.
* Make uk/nl/ru lemmatizer lookup methods consistent
The mentioned tokenizers all have their own implementation of the
`lookup` method, which accesses a `Lookups` table. The way that was
called in `token.pyx` was changed so this should be updated to have the
same arguments as `lookup` in `lemmatizer.py` (specificially (orth/id,
string)).
Prior to this change tests weren't failing, but there would probably be
issues with normal use of a model. More tests should proably be added.
Additionally, the language-specific `lookup` implementations seem like
they might not be needed, since they handle things like lower-casing
that aren't actually language specific.
* Make recently added Greek method compatible
* Remove redundant class/method
Leftovers from a merge not cleaned up adequately.
2019-09-12 18:26:11 +03:00
|
|
|
from spacy.lookups import Table


@pytest.fixture
def lemmatizer():
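    # Minimal lookup table: only these three surface forms map to real lemmas.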
    lookup = Table(data={"dogs": "dog", "boxen": "box", "mice": "mouse"})
    return Lemmatizer(lookup=lookup)


@pytest.fixture
def vocab(lemmatizer):
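    # Vocab wired up with the lookup-backed lemmatizer fixture above.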
    return Vocab(lemmatizer=lemmatizer)


def test_empty_doc(vocab):
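    # A Doc created without words should contain no tokens.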
    doc = Doc(vocab)
    assert len(doc) == 0


def test_single_word(vocab):
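    # When spaces is not given, each token is assumed to be followed by one,
    # so the text of a one-word doc carries a trailing space.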
    doc = Doc(vocab, words=["a"])
    assert doc.text == "a "
    doc = Doc(vocab, words=["a"], spaces=[False])
    assert doc.text == "a"


def test_lookup_lemmatization(vocab):
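    # "dogs" is in the lookup table; "dogses" is not, so its lemma should
    # fall back to the surface form itself.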
    doc = Doc(vocab, words=["dogs", "dogses"])
    assert doc[0].text == "dogs"
    assert doc[0].lemma_ == "dog"
    assert doc[1].text == "dogses"
    assert doc[1].lemma_ == "dogses"
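

def test_lookup_lemmatization_other_entries(vocab):
    # Hedged addition, not part of the original suite: a sketch exercising the
    # remaining fixture table entries ("boxen", "mice") through the same lookup
    # path as test_lookup_lemmatization; the test name is ours.
    doc = Doc(vocab, words=["boxen", "mice"])
    assert doc[0].lemma_ == "box"
    assert doc[1].lemma_ == "mouse"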