Improvements for Finnish tokenizer (#4985)
* Don't split on a colon: in Finnish, a colon is used to attach inflectional suffixes to abbreviations (e.g. "VTT:ssa").
* Tokenize on any of LIST_HYPHENS (except a single hyphen), not just on "--".
* Simplify the infix rules by merging similar rules.
This commit is contained in:
parent 479e81bafc
commit e1f777b151
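A quick end-to-end check of the intended behavior, as a minimal sketch: it assumes a spaCy install that includes this change, and the example sentence is taken from the new tests below.

import spacy

# A blank Finnish pipeline is enough to exercise the tokenizer rules.
nlp = spacy.blank("fi")

# Colons that attach inflectional suffixes to abbreviations no longer split:
doc = nlp("VTT:ssa ennen v:ta 2010 suoritetut mittaukset")
print([t.text for t in doc])
# expected: ['VTT:ssa', 'ennen', 'v:ta', '2010', 'suoritetut', 'mittaukset']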
Infix rules (Finnish punctuation module):

@@ -1,12 +1,13 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_HYPHENS
 from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
 from ..punctuation import TOKENIZER_SUFFIXES
 
 
 _quotes = CONCAT_QUOTES.replace("'", "")
+DASHES = "|".join(x for x in LIST_HYPHENS if x != "-")
 
 _infixes = (
     LIST_ELLIPSES
@@ -14,11 +15,9 @@ _infixes = (
     + [
         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
-        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])(?:{d})(?=[{a}])".format(a=ALPHA, d=DASHES),
-        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA),
     ]
 )
 
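The (?:{d}) group is needed because the dash alternation can contain multi-character entries, which a character class could not express. Below is a standalone sketch of the new rule; the LIST_HYPHENS and ALPHA values are simplified stand-ins for spaCy's real char_classes, not the actual definitions.

import re

# Simplified stand-in for spacy.lang.char_classes.LIST_HYPHENS (assumed to
# also contain multi-character dashes such as "--" in the real module).
LIST_HYPHENS = ["-", "--", "–", "—", "~"]

# Drop the single ASCII hyphen so ordinary compounds keep it, and join the
# rest into a regex alternation: "--|–|—|~"
DASHES = "|".join(x for x in LIST_HYPHENS if x != "-")

ALPHA = "a-zA-ZäöåÄÖÅ"  # simplified stand-in for spaCy's ALPHA class
infix = re.compile(r"(?<=[{a}])(?:{d})(?=[{a}])".format(a=ALPHA, d=DASHES))

print(bool(infix.search("Helsinki–Turku")))  # True: en dash still splits
print(bool(infix.search("linja-auto")))      # False: single hyphen is kept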
Tokenizer exceptions:

@@ -31,6 +31,9 @@ for exc_data in [
     {ORTH: "myöh.", LEMMA: "myöhempi"},
     {ORTH: "n.", LEMMA: "noin"},
     {ORTH: "nimim.", LEMMA: "nimimerkki"},
+    {ORTH: "n:o", LEMMA: "numero"},
+    {ORTH: "N:o", LEMMA: "numero"},
+    {ORTH: "nro", LEMMA: "numero"},
     {ORTH: "ns.", LEMMA: "niin sanottu"},
     {ORTH: "nyk.", LEMMA: "nykyinen"},
     {ORTH: "oik.", LEMMA: "oikealla"},
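Each ORTH/LEMMA entry above ships with the Finnish language data and becomes a tokenizer special case, so strings like "n:o" are always emitted as a single token. The same mechanism is available at runtime through the public add_special_case API; the abbreviation below is a hypothetical example, not part of the language data.

import spacy
from spacy.symbols import ORTH

nlp = spacy.blank("fi")

# Hypothetical abbreviation, registered the same way the built-in
# n:o / N:o / nro entries are registered at import time.
nlp.tokenizer.add_special_case("yo:ta", [{ORTH: "yo:ta"}])
print([t.text for t in nlp("ennen yo:ta kirjoitettu")])  # ['ennen', 'yo:ta', 'kirjoitettu']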
Tests:

@@ -19,6 +19,21 @@ HYPHENATED_TESTS = [
     )
 ]
 
+ABBREVIATION_INFLECTION_TESTS = [
+    (
+        "VTT:ssa ennen v:ta 2010 suoritetut mittaukset",
+        ["VTT:ssa", "ennen", "v:ta", "2010", "suoritetut", "mittaukset"]
+    ),
+    (
+        "ALV:n osuus on 24 %.",
+        ["ALV:n", "osuus", "on", "24", "%", "."]
+    ),
+    (
+        "Hiihtäjä oli kilpailun 14:s.",
+        ["Hiihtäjä", "oli", "kilpailun", "14:s", "."]
+    )
+]
+
 
 @pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_TESTS)
 def test_fi_tokenizer_abbreviations(fi_tokenizer, text, expected_tokens):
@@ -32,3 +47,10 @@ def test_fi_tokenizer_hyphenated_words(fi_tokenizer, text, expected_tokens):
     tokens = fi_tokenizer(text)
     token_list = [token.text for token in tokens if not token.is_space]
     assert expected_tokens == token_list
+
+
+@pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_INFLECTION_TESTS)
+def test_fi_tokenizer_abbreviation_inflections(fi_tokenizer, text, expected_tokens):
+    tokens = fi_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
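The fi_tokenizer fixture comes from the test suite's shared conftest; below is a rough standalone equivalent using only public APIs, for trying the new cases outside pytest.

import spacy

# Rough stand-in for the fi_tokenizer fixture used in the tests above.
fi_tokenizer = spacy.blank("fi").tokenizer

tokens = fi_tokenizer("Hiihtäjä oli kilpailun 14:s.")
print([t.text for t in tokens if not t.is_space])
# expected, per the test above: ['Hiihtäjä', 'oli', 'kilpailun', '14:s', '.']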