mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
55d0621532
|
@ -9,7 +9,8 @@ LIST_ICONS = [r'[\p{So}--[°]]']
|
||||||
_currency = r'\$|¢|£|€|¥|฿'
|
_currency = r'\$|¢|£|€|¥|฿'
|
||||||
_quotes = QUOTES.replace("'", '')
|
_quotes = QUOTES.replace("'", '')
|
||||||
|
|
||||||
_prefixes = ([r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)
|
_prefixes = ([r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
|
||||||
|
[r'[,.:](?=[{a}])'.format(a=ALPHA)])
|
||||||
|
|
||||||
_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
|
_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
|
||||||
[r'(?<=[0-9])\+',
|
[r'(?<=[0-9])\+',
|
||||||
|
@ -21,7 +22,7 @@ _suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
|
||||||
|
|
||||||
_infixes = (LIST_ELLIPSES + LIST_ICONS +
|
_infixes = (LIST_ELLIPSES + LIST_ICONS +
|
||||||
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
|
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
|
||||||
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
|
||||||
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
|
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
|
||||||
r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
|
r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
|
||||||
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
||||||
|
|
|
@ -22,48 +22,48 @@ _models = {'en': ['en_core_web_sm', 'en_depent_web_sm', 'en_core_web_md'],
|
||||||
# only used for tests that require loading the models
|
# only used for tests that require loading the models
|
||||||
# in all other cases, use specific instances
|
# in all other cases, use specific instances
|
||||||
|
|
||||||
@pytest.fixture(params=_models['en'], scope="session")
|
@pytest.fixture(params=_models['en'], scope='session')
|
||||||
def EN(request):
|
def EN(request):
|
||||||
return load_test_model(request.param)
|
return load_test_model(request.param)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(params=_models['de'], scope="session")
|
@pytest.fixture(params=_models['de'], scope='session')
|
||||||
def DE(request):
|
def DE(request):
|
||||||
return load_test_model(request.param)
|
return load_test_model(request.param)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(params=_models['fr'], scope="session")
|
@pytest.fixture(params=_models['fr'], scope='session')
|
||||||
def FR(request):
|
def FR(request):
|
||||||
return load_test_model(request.param)
|
return load_test_model(request.param)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(params=_languages)
|
@pytest.fixture(params=_languages, scope='session')
|
||||||
def tokenizer(request):
|
def tokenizer(request):
|
||||||
lang = util.get_lang_class(request.param)
|
lang = util.get_lang_class(request.param)
|
||||||
return lang.Defaults.create_tokenizer()
|
return lang.Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture(scope='module')
|
||||||
def en_tokenizer():
|
def en_tokenizer():
|
||||||
return util.get_lang_class('en').Defaults.create_tokenizer()
|
return util.get_lang_class('en').Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture(scope='module')
|
||||||
def en_vocab():
|
def en_vocab():
|
||||||
return util.get_lang_class('en').Defaults.create_vocab()
|
return util.get_lang_class('en').Defaults.create_vocab()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture(scope='module')
|
||||||
def en_parser():
|
def en_parser():
|
||||||
return util.get_lang_class('en').Defaults.create_parser()
|
return util.get_lang_class('en').Defaults.create_parser()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture(scope='module')
|
||||||
def es_tokenizer():
|
def es_tokenizer():
|
||||||
return util.get_lang_class('es').Defaults.create_tokenizer()
|
return util.get_lang_class('es').Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture(scope='module')
|
||||||
def de_tokenizer():
|
def de_tokenizer():
|
||||||
return util.get_lang_class('de').Defaults.create_tokenizer()
|
return util.get_lang_class('de').Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
@ -73,31 +73,31 @@ def fr_tokenizer():
|
||||||
return util.get_lang_class('fr').Defaults.create_tokenizer()
|
return util.get_lang_class('fr').Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture(scope='module')
|
||||||
def hu_tokenizer():
|
def hu_tokenizer():
|
||||||
return util.get_lang_class('hu').Defaults.create_tokenizer()
|
return util.get_lang_class('hu').Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture(scope='module')
|
||||||
def fi_tokenizer():
|
def fi_tokenizer():
|
||||||
return util.get_lang_class('fi').Defaults.create_tokenizer()
|
return util.get_lang_class('fi').Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture(scope='module')
|
||||||
def sv_tokenizer():
|
def sv_tokenizer():
|
||||||
return util.get_lang_class('sv').Defaults.create_tokenizer()
|
return util.get_lang_class('sv').Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture(scope='module')
|
||||||
def bn_tokenizer():
|
def bn_tokenizer():
|
||||||
return util.get_lang_class('bn').Defaults.create_tokenizer()
|
return util.get_lang_class('bn').Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture(scope='module')
|
||||||
def he_tokenizer():
|
def he_tokenizer():
|
||||||
return util.get_lang_class('he').Defaults.create_tokenizer()
|
return util.get_lang_class('he').Defaults.create_tokenizer()
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture(scope='module')
|
||||||
def nb_tokenizer():
|
def nb_tokenizer():
|
||||||
return util.get_lang_class('nb').Defaults.create_tokenizer()
|
return util.get_lang_class('nb').Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
@ -107,7 +107,7 @@ def stringstore():
|
||||||
return StringStore()
|
return StringStore()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture(scope='module')
|
||||||
def en_entityrecognizer():
|
def en_entityrecognizer():
|
||||||
return util.get_lang_class('en').Defaults.create_entity()
|
return util.get_lang_class('en').Defaults.create_entity()
|
||||||
|
|
||||||
|
|
|
@ -5,11 +5,11 @@ import pytest
|
||||||
|
|
||||||
DEFAULT_TESTS = [
|
DEFAULT_TESTS = [
|
||||||
('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
|
('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
|
||||||
('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']),
|
pytest.param('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.'], marks=pytest.mark.xfail),
|
||||||
('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
|
('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
|
||||||
('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']),
|
('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']),
|
||||||
('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']),
|
('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']),
|
||||||
('A .hu.', ['A', '.hu', '.']),
|
pytest.param('A .hu.', ['A', '.hu', '.'], marks=pytest.mark.xfail),
|
||||||
('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
|
('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
|
||||||
('A pl.', ['A', 'pl.']),
|
('A pl.', ['A', 'pl.']),
|
||||||
('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),
|
('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),
|
||||||
|
@ -18,7 +18,9 @@ DEFAULT_TESTS = [
|
||||||
('Valami ...van...', ['Valami', '...', 'van', '...']),
|
('Valami ...van...', ['Valami', '...', 'van', '...']),
|
||||||
('Valami...', ['Valami', '...']),
|
('Valami...', ['Valami', '...']),
|
||||||
('Valami ...', ['Valami', '...']),
|
('Valami ...', ['Valami', '...']),
|
||||||
('Valami ... más.', ['Valami', '...', 'más', '.'])
|
('Valami ... más.', ['Valami', '...', 'más', '.']),
|
||||||
|
('Soha nem lesz!', ['Soha', 'nem', 'lesz', '!']),
|
||||||
|
('Soha nem lesz?', ['Soha', 'nem', 'lesz', '?'])
|
||||||
]
|
]
|
||||||
|
|
||||||
HYPHEN_TESTS = [
|
HYPHEN_TESTS = [
|
||||||
|
@ -225,11 +227,11 @@ QUOTE_TESTS = [
|
||||||
|
|
||||||
DOT_TESTS = [
|
DOT_TESTS = [
|
||||||
('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
|
('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
|
||||||
('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']),
|
pytest.param('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.'], marks=pytest.mark.xfail),
|
||||||
('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
|
('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
|
||||||
('A pl. rövidítés.', ['A', 'pl.', 'rövidítés', '.']),
|
('A pl. rövidítés.', ['A', 'pl.', 'rövidítés', '.']),
|
||||||
('A S.M.A.R.T. szó.', ['A', 'S.M.A.R.T.', 'szó', '.']),
|
('A S.M.A.R.T. szó.', ['A', 'S.M.A.R.T.', 'szó', '.']),
|
||||||
('A .hu.', ['A', '.hu', '.']),
|
pytest.param('A .hu.', ['A', '.hu', '.'], marks=pytest.mark.xfail),
|
||||||
('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
|
('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
|
||||||
('A pl.', ['A', 'pl.']),
|
('A pl.', ['A', 'pl.']),
|
||||||
('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),
|
('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),
|
||||||
|
@ -241,6 +243,24 @@ DOT_TESTS = [
|
||||||
('Valami ... más.', ['Valami', '...', 'más', '.'])
|
('Valami ... más.', ['Valami', '...', 'más', '.'])
|
||||||
]
|
]
|
||||||
|
|
||||||
|
TYPO_TESTS = [
|
||||||
|
(
|
||||||
|
'Ez egy mondat vége.Ez egy másik eleje.', ['Ez', 'egy', 'mondat', 'vége', '.', 'Ez', 'egy', 'másik', 'eleje', '.']),
|
||||||
|
('Ez egy mondat vége .Ez egy másik eleje.',
|
||||||
|
['Ez', 'egy', 'mondat', 'vége', '.', 'Ez', 'egy', 'másik', 'eleje', '.']),
|
||||||
|
(
|
||||||
|
'Ez egy mondat vége!ez egy másik eleje.', ['Ez', 'egy', 'mondat', 'vége', '!', 'ez', 'egy', 'másik', 'eleje', '.']),
|
||||||
|
('Ez egy mondat vége !ez egy másik eleje.',
|
||||||
|
['Ez', 'egy', 'mondat', 'vége', '!', 'ez', 'egy', 'másik', 'eleje', '.']),
|
||||||
|
(
|
||||||
|
'Ez egy mondat vége?Ez egy másik eleje.', ['Ez', 'egy', 'mondat', 'vége', '?', 'Ez', 'egy', 'másik', 'eleje', '.']),
|
||||||
|
('Ez egy mondat vége ?Ez egy másik eleje.',
|
||||||
|
['Ez', 'egy', 'mondat', 'vége', '?', 'Ez', 'egy', 'másik', 'eleje', '.']),
|
||||||
|
('egy,kettő', ['egy', ',', 'kettő']),
|
||||||
|
('egy ,kettő', ['egy', ',', 'kettő']),
|
||||||
|
('egy :kettő', ['egy', ':', 'kettő']),
|
||||||
|
]
|
||||||
|
|
||||||
WIKI_TESTS = [
|
WIKI_TESTS = [
|
||||||
('!"', ['!', '"']),
|
('!"', ['!', '"']),
|
||||||
('lány"a', ['lány', '"', 'a']),
|
('lány"a', ['lány', '"', 'a']),
|
||||||
|
@ -253,7 +273,7 @@ WIKI_TESTS = [
|
||||||
('cérium(IV)-oxid', ['cérium', '(', 'IV', ')', '-oxid'])
|
('cérium(IV)-oxid', ['cérium', '(', 'IV', ')', '-oxid'])
|
||||||
]
|
]
|
||||||
|
|
||||||
TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS + HYPHEN_TESTS + WIKI_TESTS
|
TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS + HYPHEN_TESTS + WIKI_TESTS + TYPO_TESTS
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
|
@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
import json
|
import json
|
||||||
import os
|
|
||||||
import random
|
import random
|
||||||
import contextlib
|
import contextlib
|
||||||
import shutil
|
import shutil
|
||||||
|
@ -9,7 +8,6 @@ import tempfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
import pathlib
|
|
||||||
from ...gold import GoldParse
|
from ...gold import GoldParse
|
||||||
from ...pipeline import EntityRecognizer
|
from ...pipeline import EntityRecognizer
|
||||||
from ...lang.en import English
|
from ...lang.en import English
|
||||||
|
@ -79,7 +77,8 @@ def test_issue910(EN, train_data, additional_entity_types):
|
||||||
2) There's no way to set the learning rate for the weight update, so we
|
2) There's no way to set the learning rate for the weight update, so we
|
||||||
end up out-of-scale, causing it to learn too fast.
|
end up out-of-scale, causing it to learn too fast.
|
||||||
'''
|
'''
|
||||||
doc = EN(u"I am looking for a restaurant in Berlin")
|
nlp = EN
|
||||||
|
doc = nlp(u"I am looking for a restaurant in Berlin")
|
||||||
ents_before_train = [(ent.label_, ent.text) for ent in doc.ents]
|
ents_before_train = [(ent.label_, ent.text) for ent in doc.ents]
|
||||||
# Fine tune the ner model
|
# Fine tune the ner model
|
||||||
for entity_type in additional_entity_types:
|
for entity_type in additional_entity_types:
|
||||||
|
|
|
@ -22,12 +22,12 @@ main.o-main.o-main--sidebar.o-main--aside
|
||||||
+infobox("⚠️ You are viewing the spaCy v2.0.0 alpha docs")
|
+infobox("⚠️ You are viewing the spaCy v2.0.0 alpha docs")
|
||||||
strong This page is part of the alpha documentation for spaCy v2.0.
|
strong This page is part of the alpha documentation for spaCy v2.0.
|
||||||
| It does not reflect the state of the latest stable release.
|
| It does not reflect the state of the latest stable release.
|
||||||
| Because v2.0 is still under development, the actual
|
| Because v2.0 is still under development, the implementation
|
||||||
| implementation may differ from the intended state described
|
| may differ from the intended state described here. See the
|
||||||
| here.
|
| #[+a(gh("spaCy") + "/releases/tag/v2.0.0-alpha") release notes]
|
||||||
| #[+a("#") See here] for more information on how to install
|
| for details on how to install and test the new version. To
|
||||||
| and test the new version. To read the official docs for
|
| read the official docs for spaCy v1.x,
|
||||||
| v1.x, #[+a("https://spacy.io/docs") go here].
|
| #[+a("https://spacy.io/docs") go here].
|
||||||
|
|
||||||
!=yield
|
!=yield
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user