Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthew Honnibal 2017-06-04 15:53:25 -05:00
commit 55d0621532
5 changed files with 53 additions and 33 deletions

View File

@@ -9,7 +9,8 @@ LIST_ICONS = [r'[\p{So}--[°]]']
 _currency = r'\$|¢|£|€|¥|฿'
 _quotes = QUOTES.replace("'", '')
 
-_prefixes = ([r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)
+_prefixes = ([r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
+             [r'[,.:](?=[{a}])'.format(a=ALPHA)])
 
 _suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
              [r'(?<=[0-9])\+',

@@ -21,7 +22,7 @@ _suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
 _infixes = (LIST_ELLIPSES + LIST_ICONS +
             [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
-             r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
+             r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
              r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
              r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
              r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
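In effect, the new prefix and infix rules split sentence punctuation that is glued to the next word, the typo pattern the new Hungarian tests below exercise. A minimal sketch of how such lookaround rules behave, using plain re and an ASCII-only stand-in for spaCy's ALPHA class (both are illustrative assumptions, not the actual tokenizer):

import re

# ASCII stand-in for spaCy's ALPHA class; the real one covers all Unicode letters.
ALPHA = 'a-zA-Z'

# New infix rule: ',', '!' or '?' flanked by letters is split out.
infix_re = re.compile(r'(?<=[{a}])([,!?])(?=[{a}])'.format(a=ALPHA))
print(infix_re.split('vége!ez'))    # ['vége', '!', 'ez']

# New prefix rule: ',', '.' or ':' glued to the front of a word is peeled off.
prefix_re = re.compile(r'^[,.:](?=[{a}])'.format(a=ALPHA))
print(prefix_re.sub('', ',kettő'))  # 'kettő' (the ',' becomes its own token)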

View File

@@ -22,48 +22,48 @@ _models = {'en': ['en_core_web_sm', 'en_depent_web_sm', 'en_core_web_md'],
 # only used for tests that require loading the models
 # in all other cases, use specific instances
 
-@pytest.fixture(params=_models['en'], scope="session")
+@pytest.fixture(params=_models['en'], scope='session')
 def EN(request):
     return load_test_model(request.param)
 
-@pytest.fixture(params=_models['de'], scope="session")
+@pytest.fixture(params=_models['de'], scope='session')
 def DE(request):
     return load_test_model(request.param)
 
-@pytest.fixture(params=_models['fr'], scope="session")
+@pytest.fixture(params=_models['fr'], scope='session')
 def FR(request):
     return load_test_model(request.param)
 
-@pytest.fixture(params=_languages)
+@pytest.fixture(params=_languages, scope='session')
 def tokenizer(request):
     lang = util.get_lang_class(request.param)
     return lang.Defaults.create_tokenizer()
 
-@pytest.fixture
+@pytest.fixture(scope='module')
 def en_tokenizer():
     return util.get_lang_class('en').Defaults.create_tokenizer()
 
-@pytest.fixture
+@pytest.fixture(scope='module')
 def en_vocab():
     return util.get_lang_class('en').Defaults.create_vocab()
 
-@pytest.fixture
+@pytest.fixture(scope='module')
 def en_parser():
     return util.get_lang_class('en').Defaults.create_parser()
 
-@pytest.fixture
+@pytest.fixture(scope='module')
 def es_tokenizer():
     return util.get_lang_class('es').Defaults.create_tokenizer()
 
-@pytest.fixture
+@pytest.fixture(scope='module')
 def de_tokenizer():
     return util.get_lang_class('de').Defaults.create_tokenizer()
@@ -73,31 +73,31 @@ def fr_tokenizer():
     return util.get_lang_class('fr').Defaults.create_tokenizer()
 
-@pytest.fixture
+@pytest.fixture(scope='module')
 def hu_tokenizer():
     return util.get_lang_class('hu').Defaults.create_tokenizer()
 
-@pytest.fixture
+@pytest.fixture(scope='module')
 def fi_tokenizer():
     return util.get_lang_class('fi').Defaults.create_tokenizer()
 
-@pytest.fixture
+@pytest.fixture(scope='module')
 def sv_tokenizer():
     return util.get_lang_class('sv').Defaults.create_tokenizer()
 
-@pytest.fixture
+@pytest.fixture(scope='module')
 def bn_tokenizer():
     return util.get_lang_class('bn').Defaults.create_tokenizer()
 
-@pytest.fixture
+@pytest.fixture(scope='module')
 def he_tokenizer():
     return util.get_lang_class('he').Defaults.create_tokenizer()
 
-@pytest.fixture
+@pytest.fixture(scope='module')
 def nb_tokenizer():
     return util.get_lang_class('nb').Defaults.create_tokenizer()
@@ -107,7 +107,7 @@ def stringstore():
     return StringStore()
 
-@pytest.fixture
+@pytest.fixture(scope='module')
 def en_entityrecognizer():
     return util.get_lang_class('en').Defaults.create_entity()
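The recurring change in this file is pytest fixture scoping: the default scope is 'function', so every test rebuilds the tokenizer or vocab, while scope='module' builds it once per test module and scope='session' once per run. A self-contained sketch of the caching difference (the CALLS counter is purely illustrative):

import pytest

CALLS = {'function': 0, 'module': 0}

@pytest.fixture
def fresh():
    # Default 'function' scope: rebuilt for every test that requests it.
    CALLS['function'] += 1
    return CALLS['function']

@pytest.fixture(scope='module')
def cached():
    # 'module' scope: built once, then shared by all tests in this module.
    CALLS['module'] += 1
    return CALLS['module']

def test_a(fresh, cached):
    assert (fresh, cached) == (1, 1)

def test_b(fresh, cached):
    assert (fresh, cached) == (2, 1)  # fresh re-ran, cached did not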

View File

@@ -5,11 +5,11 @@ import pytest
 DEFAULT_TESTS = [
     ('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
-    ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']),
+    pytest.param('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.'], marks=pytest.mark.xfail),
     ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
     ('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']),
     ('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']),
-    ('A .hu.', ['A', '.hu', '.']),
+    pytest.param('A .hu.', ['A', '.hu', '.'], marks=pytest.mark.xfail),
     ('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
     ('A pl.', ['A', 'pl.']),
     ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),
@@ -18,7 +18,9 @@ DEFAULT_TESTS = [
     ('Valami ...van...', ['Valami', '...', 'van', '...']),
     ('Valami...', ['Valami', '...']),
     ('Valami ...', ['Valami', '...']),
-    ('Valami ... más.', ['Valami', '...', 'más', '.'])
+    ('Valami ... más.', ['Valami', '...', 'más', '.']),
+    ('Soha nem lesz!', ['Soha', 'nem', 'lesz', '!']),
+    ('Soha nem lesz?', ['Soha', 'nem', 'lesz', '?'])
 ]
 
 HYPHEN_TESTS = [
@@ -225,11 +227,11 @@ QUOTE_TESTS = [
 DOT_TESTS = [
     ('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
-    ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']),
+    pytest.param('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.'], marks=pytest.mark.xfail),
     ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
     ('A pl. rövidítés.', ['A', 'pl.', 'rövidítés', '.']),
     ('A S.M.A.R.T. szó.', ['A', 'S.M.A.R.T.', 'szó', '.']),
-    ('A .hu.', ['A', '.hu', '.']),
+    pytest.param('A .hu.', ['A', '.hu', '.'], marks=pytest.mark.xfail),
     ('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
     ('A pl.', ['A', 'pl.']),
     ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),
@@ -241,6 +243,24 @@ DOT_TESTS = [
     ('Valami ... más.', ['Valami', '...', 'más', '.'])
 ]
 
+TYPO_TESTS = [
+    ('Ez egy mondat vége.Ez egy másik eleje.',
+     ['Ez', 'egy', 'mondat', 'vége', '.', 'Ez', 'egy', 'másik', 'eleje', '.']),
+    ('Ez egy mondat vége .Ez egy másik eleje.',
+     ['Ez', 'egy', 'mondat', 'vége', '.', 'Ez', 'egy', 'másik', 'eleje', '.']),
+    ('Ez egy mondat vége!ez egy másik eleje.',
+     ['Ez', 'egy', 'mondat', 'vége', '!', 'ez', 'egy', 'másik', 'eleje', '.']),
+    ('Ez egy mondat vége !ez egy másik eleje.',
+     ['Ez', 'egy', 'mondat', 'vége', '!', 'ez', 'egy', 'másik', 'eleje', '.']),
+    ('Ez egy mondat vége?Ez egy másik eleje.',
+     ['Ez', 'egy', 'mondat', 'vége', '?', 'Ez', 'egy', 'másik', 'eleje', '.']),
+    ('Ez egy mondat vége ?Ez egy másik eleje.',
+     ['Ez', 'egy', 'mondat', 'vége', '?', 'Ez', 'egy', 'másik', 'eleje', '.']),
+    ('egy,kettő', ['egy', ',', 'kettő']),
+    ('egy ,kettő', ['egy', ',', 'kettő']),
+    ('egy :kettő', ['egy', ':', 'kettő']),
+]
+
 WIKI_TESTS = [
     ('!"', ['!', '"']),
     ('lány"a', ['lány', '"', 'a']),
@@ -253,7 +273,7 @@ WIKI_TESTS = [
     ('cérium(IV)-oxid', ['cérium', '(', 'IV', ')', '-oxid'])
 ]
 
-TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS + HYPHEN_TESTS + WIKI_TESTS
+TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS + HYPHEN_TESTS + WIKI_TESTS + TYPO_TESTS
 
 @pytest.mark.parametrize('text,expected_tokens', TESTCASES)
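The other recurring change here converts known-broken cases from plain tuples into pytest.param(..., marks=pytest.mark.xfail), so they are reported as expected failures instead of breaking the suite, and the new TYPO_TESTS join the combined TESTCASES list. A toy illustration of the xfail mechanics (naive_split is a hypothetical stand-in for the real hu_tokenizer fixture):

import pytest

def naive_split(text):
    # Toy whitespace tokenizer standing in for the real Hungarian tokenizer.
    return text.split()

CASES = [
    ('Soha nem lesz', ['Soha', 'nem', 'lesz']),  # passes
    pytest.param('A .hu.', ['A', '.hu', '.'],    # known gap, reported as
                 marks=pytest.mark.xfail),       # XFAIL instead of FAILED
]

@pytest.mark.parametrize('text,expected', CASES)
def test_tokenize(text, expected):
    assert naive_split(text) == expected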

View File

@@ -1,6 +1,5 @@
 from __future__ import unicode_literals
 import json
-import os
 import random
 import contextlib
 import shutil
@@ -9,7 +8,6 @@ import tempfile
 from pathlib import Path
-import pathlib
 from ...gold import GoldParse
 from ...pipeline import EntityRecognizer
 from ...lang.en import English
@@ -79,7 +77,8 @@ def test_issue910(EN, train_data, additional_entity_types):
     2) There's no way to set the learning rate for the weight update, so we
     end up out-of-scale, causing it to learn too fast.
     '''
-    doc = EN(u"I am looking for a restaurant in Berlin")
+    nlp = EN
+    doc = nlp(u"I am looking for a restaurant in Berlin")
     ents_before_train = [(ent.label_, ent.text) for ent in doc.ents]
    # Fine tune the ner model
     for entity_type in additional_entity_types:
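For context, the EN fixture yields a loaded English pipeline, so the renamed nlp variable is called on raw text to produce a Doc whose entities the test snapshots before fine-tuning. A rough standalone equivalent, assuming an installed English model (the shortcut name below is an assumption tied to spaCy versions of this era):

import spacy

# 'en' was the model shortcut for spaCy 1.x/2.x-era installs; adjust to
# whatever English model is actually available.
nlp = spacy.load('en')
doc = nlp(u"I am looking for a restaurant in Berlin")
print([(ent.label_, ent.text) for ent in doc.ents])
# e.g. [('GPE', 'Berlin')]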

View File

@@ -22,12 +22,12 @@ main.o-main.o-main--sidebar.o-main--aside
     +infobox("⚠️ You are viewing the spaCy v2.0.0 alpha docs")
         strong This page is part of the alpha documentation for spaCy v2.0.
         |  It does not reflect the state of the latest stable release.
-        |  Because v2.0 is still under development, the actual
-        |  implementation may differ from the intended state described
-        |  here.
-        |  #[+a("#") See here] for more information on how to install
-        |  and test the new version. To read the official docs for
-        |  v1.x, #[+a("https://spacy.io/docs") go here].
+        |  Because v2.0 is still under development, the implementation
+        |  may differ from the intended state described here. See the
+        |  #[+a(gh("spaCy") + "/releases/tag/v2.0.0-alpha") release notes]
+        |  for details on how to install and test the new version. To
+        |  read the official docs for spaCy v1.x,
+        |  #[+a("https://spacy.io/docs") go here].
 
     !=yield