Merge branch 'master' into spacy.io

Ines Montani 2020-02-10 20:34:22 -05:00
commit 13b516289b
6 changed files with 32 additions and 8 deletions

requirements.txt

@@ -5,7 +5,7 @@ thinc==7.4.0.dev0
 blis>=0.4.0,<0.5.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.4.0,<1.1.0
-srsly>=0.1.0,<1.1.0
+srsly>=1.0.1,<1.1.0
 catalogue>=0.0.7,<1.1.0
 # Third party dependencies
 numpy>=1.15.0

setup.cfg

@@ -47,7 +47,7 @@ install_requires =
 thinc==7.4.0.dev0
 blis>=0.4.0,<0.5.0
 wasabi>=0.4.0,<1.1.0
-srsly>=0.1.0,<1.1.0
+srsly>=1.0.1,<1.1.0
 catalogue>=0.0.7,<1.1.0
 # Third-party dependencies
 tqdm>=4.38.0,<5.0.0
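
For illustration (not part of this diff): the tightened srsly pin can be checked against an installed environment, assuming setuptools' pkg_resources is available.

import pkg_resources

# Raises VersionConflict or DistributionNotFound if the installed
# srsly does not satisfy the new lower bound
pkg_resources.require("srsly>=1.0.1,<1.1.0")
print("srsly pin satisfied")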

spacy/lang/fi/punctuation.py

@@ -1,12 +1,13 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_HYPHENS
 from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
 from ..punctuation import TOKENIZER_SUFFIXES
 
 _quotes = CONCAT_QUOTES.replace("'", "")
 
+DASHES = "|".join(x for x in LIST_HYPHENS if x != "-")
+
 _infixes = (
     LIST_ELLIPSES
@@ -14,11 +15,9 @@ _infixes = (
     + [
         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
-        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])(?:{d})(?=[{a}])".format(a=ALPHA, d=DASHES),
+        r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA),
     ]
 )
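
For illustration (not part of this diff): the net effect of these hunks is that ":", "<", ">" and "=" between letters no longer split tokens, and only non-ASCII dash variants act as infixes. A minimal standalone sketch of the two regex changes, using simplified stand-ins for spaCy's ALPHA and LIST_HYPHENS (assumptions; the real char_classes definitions cover far more characters):

import re

# Simplified stand-ins for spacy.lang.char_classes values (assumptions)
ALPHA = "a-zA-ZäöåÄÖÅ"
LIST_HYPHENS = ["-", "–", "—"]  # ASCII hyphen, en dash, em dash

# Same construction as the diff: every dash variant except plain "-"
DASHES = "|".join(x for x in LIST_HYPHENS if x != "-")
new_dash_re = re.compile(r"(?<=[{a}])(?:{d})(?=[{a}])".format(a=ALPHA, d=DASHES))
old_colon_re = re.compile(r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA))

print(bool(new_dash_re.search("sana–sana")))   # True: en dash still splits
print(bool(new_dash_re.search("linja-auto")))  # False: "-" stays inside compounds
print(bool(old_colon_re.search("VTT:ssa")))    # True: the removed rule used to split these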

spacy/lang/fi/tokenizer_exceptions.py

@@ -31,6 +31,9 @@ for exc_data in [
     {ORTH: "myöh.", LEMMA: "myöhempi"},
     {ORTH: "n.", LEMMA: "noin"},
     {ORTH: "nimim.", LEMMA: "nimimerkki"},
+    {ORTH: "n:o", LEMMA: "numero"},
+    {ORTH: "N:o", LEMMA: "numero"},
+    {ORTH: "nro", LEMMA: "numero"},
     {ORTH: "ns.", LEMMA: "niin sanottu"},
     {ORTH: "nyk.", LEMMA: "nykyinen"},
     {ORTH: "oik.", LEMMA: "oikealla"},

spacy/tests/lang/fi/test_tokenizer.py

@@ -19,6 +19,21 @@ HYPHENATED_TESTS = [
     )
 ]
 
+ABBREVIATION_INFLECTION_TESTS = [
+    (
+        "VTT:ssa ennen v:ta 2010 suoritetut mittaukset",
+        ["VTT:ssa", "ennen", "v:ta", "2010", "suoritetut", "mittaukset"]
+    ),
+    (
+        "ALV:n osuus on 24 %.",
+        ["ALV:n", "osuus", "on", "24", "%", "."]
+    ),
+    (
+        "Hiihtäjä oli kilpailun 14:s.",
+        ["Hiihtäjä", "oli", "kilpailun", "14:s", "."]
+    )
+]
+
 
 @pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_TESTS)
 def test_fi_tokenizer_abbreviations(fi_tokenizer, text, expected_tokens):
@@ -32,3 +47,10 @@ def test_fi_tokenizer_hyphenated_words(fi_tokenizer, text, expected_tokens):
     tokens = fi_tokenizer(text)
     token_list = [token.text for token in tokens if not token.is_space]
     assert expected_tokens == token_list
+
+
+@pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_INFLECTION_TESTS)
+def test_fi_tokenizer_abbreviation_inflections(fi_tokenizer, text, expected_tokens):
+    tokens = fi_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
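
For illustration (not part of this diff): the new cases can be exercised from a source checkout, assuming pytest is installed and the test module path above.

import pytest

# Equivalent to the CLI:
#   pytest spacy/tests/lang/fi/test_tokenizer.py -k abbreviation_inflections -v
pytest.main(["spacy/tests/lang/fi/test_tokenizer.py",
             "-k", "abbreviation_inflections", "-v"])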

website/docs/usage/visualizers.md

@@ -327,7 +327,7 @@ displaCy in our [online demo](https://explosion.ai/demos/displacy)..
 ### Disabling the parser {#disabling}
 
 In the [default models](/models), the parser is loaded and enabled as part of
-the [standard processing pipeline](/usage/processing-pipelin). If you don't need
+the [standard processing pipeline](/usage/processing-pipelines). If you don't need
 any of the syntactic information, you should disable the parser. Disabling the
 parser will make spaCy load and run much faster. If you want to load the parser,
 but need to disable it for specific documents, you can also control its use on
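
For illustration (not part of this diff): the advice in this docs section amounts to one keyword argument at load time. A minimal sketch, assuming the en_core_web_sm model is installed:

import spacy

# Leave the parser out of the pipeline entirely
nlp = spacy.load("en_core_web_sm", disable=["parser"])

doc = nlp("This text skips dependency parsing.")
print(nlp.pipe_names)  # no "parser" in the list
print(doc.is_parsed)   # False (spaCy v2.x attribute)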