Update Polish tokenizer for UD_Polish-PDB (#5432)
Update Polish tokenizer for UD_Polish-PDB, which is a relatively major change from the existing tokenizer. Unused exception files and conflicting test cases removed.

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
parent a5cd203284
commit 0061992d95
@@ -1,8 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals

-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_SUFFIXES
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
@@ -23,10 +23,15 @@ class PolishDefaults(Language.Defaults):
     lex_attr_getters[NORM] = add_lookups(
         Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
     )
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    mod_base_exceptions = {
+        exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
+    }
+    tokenizer_exceptions = mod_base_exceptions
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
+    prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
+    suffixes = TOKENIZER_SUFFIXES

     @classmethod
     def create_lemmatizer(cls, nlp=None, lookups=None):
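In the new PolishDefaults, the dict comprehension drops every BASE_EXCEPTIONS entry whose orth string ends in a period, so trailing periods are split off by the new suffix rules instead of being absorbed into shared abbreviation exceptions. A minimal sketch of the same filter, using a made-up stand-in for BASE_EXCEPTIONS (the toy entries are illustrative assumptions, not spaCy's real table):

# Toy stand-in for spaCy's BASE_EXCEPTIONS: orth string -> list of token dicts.
base_exceptions = {
    "a.m.": [{"ORTH": "a.m."}],  # ends in "." -> dropped by the filter
    ":)": [{"ORTH": ":)"}],      # kept as a single-token exception
}

# Same comprehension as in the diff: keep only exceptions that do not end in ".",
# so the Polish suffix rules get to handle final periods.
mod_base_exceptions = {
    exc: val for exc, val in base_exceptions.items() if not exc.endswith(".")
}

print(sorted(mod_base_exceptions))  # [':)']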
File diff suppressed because it is too large
@@ -1,23 +0,0 @@
-
-Copyright (c) 2019, Marcin Miłkowski
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice, this
-list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright notice,
-this list of conditions and the following disclaimer in the documentation
-and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -1,22 +1,46 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ..char_classes import LIST_ELLIPSES, CONCAT_ICONS
+from ..char_classes import LIST_ELLIPSES, LIST_PUNCT, LIST_HYPHENS
+from ..char_classes import LIST_ICONS, LIST_QUOTES, CURRENCY, UNITS, PUNCT
 from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES

 _quotes = CONCAT_QUOTES.replace("'", "")

+_prefixes = _prefixes = [r"(długo|krótko|jedno|dwu|trzy|cztero)-"] + BASE_TOKENIZER_PREFIXES
+
 _infixes = (
     LIST_ELLIPSES
-    + [CONCAT_ICONS]
+    + LIST_ICONS
+    + LIST_HYPHENS
     + [
-        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
+        r"(?<=[0-9{al}])\.(?=[0-9{au}])".format(al=ALPHA, au=ALPHA_UPPER),
         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])[:<>=\/](?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES),
+        r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=_quotes),
     ]
 )

+_suffixes = (
+    ["''", "’’", r"\.", "…"]
+    + LIST_PUNCT
+    + LIST_QUOTES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])\+",
+        r"(?<=°[FfCcKk])\.",
+        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
+        r"(?<=[0-9])(?:{u})".format(u=UNITS),
+        r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
+            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
+        ),
+        r"(?<=[{au}])\.".format(au=ALPHA_UPPER),
+    ]
+)
+
+TOKENIZER_PREFIXES = _prefixes
 TOKENIZER_INFIXES = _infixes
+TOKENIZER_SUFFIXES = _suffixes
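spaCy compiles prefix, infix and suffix lists like these into regular expressions when it builds the tokenizer. A rough sketch of what the new hyphenated numeral prefix rule matches on its own, using spaCy's compile_prefix_regex helper; the sample strings are illustrative assumptions, not taken from the commit:

from spacy.util import compile_prefix_regex

# The new Polish rule from this diff, tried in isolation (the real tokenizer
# prepends it to the shared BASE_TOKENIZER_PREFIXES).
prefix_re = compile_prefix_regex([r"(długo|krótko|jedno|dwu|trzy|cztero)-"])

for text in ["jedno-", "dwu-osobowy", "pokój"]:
    match = prefix_re.search(text)
    print(text, "->", match.group() if match else None)
# jedno-      -> jedno-
# dwu-osobowy -> dwu-
# pokój       -> None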
@@ -1,26 +0,0 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-from ._tokenizer_exceptions_list import PL_BASE_EXCEPTIONS
-from ...symbols import POS, ADV, NOUN, ORTH, LEMMA, ADJ
-
-
-_exc = {}
-
-for exc_data in [
-    {ORTH: "m.in.", LEMMA: "między innymi", POS: ADV},
-    {ORTH: "inż.", LEMMA: "inżynier", POS: NOUN},
-    {ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
-    {ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
-    {ORTH: "tj.", LEMMA: "to jest", POS: ADV},
-    {ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ},
-]:
-    _exc[exc_data[ORTH]] = [exc_data]
-
-for orth in ["w.", "r."]:
-    _exc[orth] = [{ORTH: orth}]
-
-for orth in PL_BASE_EXCEPTIONS:
-    _exc[orth] = [{ORTH: orth}]
-
-TOKENIZER_EXCEPTIONS = _exc
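With this module deleted, abbreviations such as m.in., inż. or tzw. are no longer hard-coded special cases, and their segmentation now follows the punctuation rules above. A project that still needs one of them kept as a single token can re-register it at runtime through the tokenizer's special-case API; a minimal sketch, assuming spaCy v2.x with the updated Polish data installed:

import spacy
from spacy.symbols import ORTH

nlp = spacy.blank("pl")

# Re-add one of the removed abbreviations as a single-token special case.
nlp.tokenizer.add_special_case("m.in.", [{ORTH: "m.in."}])

print([t.text for t in nlp("Wiele miast, m.in. Kraków")])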
@@ -4,49 +4,15 @@ from __future__ import unicode_literals
 import pytest

 DOT_TESTS = [
-    ("tel.", ["tel."]),
-    ("np.", ["np."]),
-    ("godz. 21:37", ["godz.", "21:37"]),
-    ("inż.", ["inż."]),
-    ("gosp.-polit.", ["gosp.-polit."]),
-    ("ppoż", ["ppoż"]),
-    ("płn", ["płn"]),
-    ("ul.", ["ul."]),
-    ("jw.", ["jw."]),
-    ("itd.", ["itd."]),
-    ("cdn.", ["cdn."]),
-    ("itp.", ["itp."]),
-    ("10,- zł", ["10,-", "zł"]),
+    ("tel.", ["tel", "."]),
     ("0 zł 99 gr", ["0", "zł", "99", "gr"]),
-    ("0,99 rub.", ["0,99", "rub."]),
-    ("dol.", ["dol."]),
-    ("1000 m n.p.m.", ["1000", "m", "n.p.m."]),
-    ("m.in.", ["m.in."]),
-    ("p.n.e.", ["p.n.e."]),
-    ("Sz.P.", ["Sz.P."]),
-    ("p.o.", ["p.o."]),
-    ("k.o.", ["k.o."]),
-    ("m.st.", ["m.st."]),
-    ("dra.", ["dra", "."]),
-    ("pp.", ["pp."]),
-    ("oo.", ["oo."]),
 ]

 HYPHEN_TESTS = [
-    ("5-fluoropentylo-3-pirydynyloindol", ["5-fluoropentylo-3-pirydynyloindol"]),
-    ("NESS-040C5", ["NESS-040C5"]),
-    ("JTE-7-31", ["JTE-7-31"]),
-    ("BAY-59-3074", ["BAY-59-3074"]),
-    ("BAY-38-7271", ["BAY-38-7271"]),
-    ("STS-135", ["STS-135"]),
-    ("5F-PB-22", ["5F-PB-22"]),
     ("cztero-", ["cztero-"]),
     ("jedno-", ["jedno-"]),
     ("dwu-", ["dwu-"]),
     ("trzy-", ["trzy-"]),
-    ("b-adoratorzy", ["b-adoratorzy"]),
-    ("2-3-4 drzewa", ["2-3-4", "drzewa"]),
-    ("b-drzewa", ["b-drzewa"]),
 ]

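The hunk shows only the test data tables; the parametrized test functions that consume them lie outside the diff context. A rough sketch of how such a check typically looks in spaCy's language tests, assuming the shared pl_tokenizer fixture; the function name is illustrative:

@pytest.mark.parametrize("text,expected_tokens", DOT_TESTS + HYPHEN_TESTS)
def test_pl_tokenizer(pl_tokenizer, text, expected_tokens):
    # Tokenize the raw text and compare surface forms with the expected
    # segmentation listed in the tables above.
    tokens = pl_tokenizer(text)
    assert [t.text for t in tokens if not t.is_space] == expected_tokens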