Merge branch 'master' into spacy.io

This commit is contained in:
Ines Montani 2020-02-10 20:34:22 -05:00
commit 13b516289b
6 changed files with 32 additions and 8 deletions

View File

@ -5,7 +5,7 @@ thinc==7.4.0.dev0
blis>=0.4.0,<0.5.0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.4.0,<1.1.0
srsly>=0.1.0,<1.1.0
srsly>=1.0.1,<1.1.0
catalogue>=0.0.7,<1.1.0
# Third party dependencies
numpy>=1.15.0

View File

@ -47,7 +47,7 @@ install_requires =
thinc==7.4.0.dev0
blis>=0.4.0,<0.5.0
wasabi>=0.4.0,<1.1.0
srsly>=0.1.0,<1.1.0
srsly>=1.0.1,<1.1.0
catalogue>=0.0.7,<1.1.0
# Third-party dependencies
tqdm>=4.38.0,<5.0.0

View File

@ -1,12 +1,13 @@
# coding: utf8
from __future__ import unicode_literals
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_HYPHENS
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
from ..punctuation import TOKENIZER_SUFFIXES
_quotes = CONCAT_QUOTES.replace("'", "")
DASHES = "|".join(x for x in LIST_HYPHENS if x != "-")
_infixes = (
LIST_ELLIPSES
@ -14,11 +15,9 @@ _infixes = (
+ [
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])(?:{d})(?=[{a}])".format(a=ALPHA, d=DASHES),
r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA),
]
)

View File

@ -31,6 +31,9 @@ for exc_data in [
{ORTH: "myöh.", LEMMA: "myöhempi"},
{ORTH: "n.", LEMMA: "noin"},
{ORTH: "nimim.", LEMMA: "nimimerkki"},
{ORTH: "n:o", LEMMA: "numero"},
{ORTH: "N:o", LEMMA: "numero"},
{ORTH: "nro", LEMMA: "numero"},
{ORTH: "ns.", LEMMA: "niin sanottu"},
{ORTH: "nyk.", LEMMA: "nykyinen"},
{ORTH: "oik.", LEMMA: "oikealla"},

View File

@ -19,6 +19,21 @@ HYPHENATED_TESTS = [
)
]
ABBREVIATION_INFLECTION_TESTS = [
(
"VTT:ssa ennen v:ta 2010 suoritetut mittaukset",
["VTT:ssa", "ennen", "v:ta", "2010", "suoritetut", "mittaukset"]
),
(
"ALV:n osuus on 24 %.",
["ALV:n", "osuus", "on", "24", "%", "."]
),
(
"Hiihtäjä oli kilpailun 14:s.",
["Hiihtäjä", "oli", "kilpailun", "14:s", "."]
)
]
@pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_TESTS)
def test_fi_tokenizer_abbreviations(fi_tokenizer, text, expected_tokens):
@ -32,3 +47,10 @@ def test_fi_tokenizer_hyphenated_words(fi_tokenizer, text, expected_tokens):
tokens = fi_tokenizer(text)
token_list = [token.text for token in tokens if not token.is_space]
assert expected_tokens == token_list
@pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_INFLECTION_TESTS)
def test_fi_tokenizer_abbreviation_inflections(fi_tokenizer, text, expected_tokens):
tokens = fi_tokenizer(text)
token_list = [token.text for token in tokens if not token.is_space]
assert expected_tokens == token_list

View File

@ -327,7 +327,7 @@ displaCy in our [online demo](https://explosion.ai/demos/displacy)..
### Disabling the parser {#disabling}
In the [default models](/models), the parser is loaded and enabled as part of
the [standard processing pipeline](/usage/processing-pipelin). If you don't need
the [standard processing pipeline](/usage/processing-pipelines). If you don't need
any of the syntactic information, you should disable the parser. Disabling the
parser will make spaCy load and run much faster. If you want to load the parser,
but need to disable it for specific documents, you can also control its use on