mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
Update Polish tokenizer for UD_Polish-PDB (#5432)
Update Polish tokenizer for UD_Polish-PDB, which is a relatively major change from the existing tokenizer. Unused exceptions files and conflicting test cases removed. Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
This commit is contained in:
parent
a5cd203284
commit
0061992d95
|
@ -1,8 +1,8 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_INFIXES
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
from .tag_map import TAG_MAP
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
@ -23,10 +23,15 @@ class PolishDefaults(Language.Defaults):
|
|||
lex_attr_getters[NORM] = add_lookups(
|
||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||
)
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
mod_base_exceptions = {
|
||||
exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
|
||||
}
|
||||
tokenizer_exceptions = mod_base_exceptions
|
||||
stop_words = STOP_WORDS
|
||||
tag_map = TAG_MAP
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None, lookups=None):
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,23 +0,0 @@
|
|||
|
||||
Copyright (c) 2019, Marcin Miłkowski
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@ -1,22 +1,46 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..char_classes import LIST_ELLIPSES, CONCAT_ICONS
|
||||
from ..char_classes import LIST_ELLIPSES, LIST_PUNCT, LIST_HYPHENS
|
||||
from ..char_classes import LIST_ICONS, LIST_QUOTES, CURRENCY, UNITS, PUNCT
|
||||
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
||||
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
|
||||
|
||||
_quotes = CONCAT_QUOTES.replace("'", "")
|
||||
|
||||
# Polish-specific prefix pattern (a few combining forms followed by a hyphen,
# e.g. "dwu-"), placed ahead of the shared base prefix patterns.
_prefixes = [r"(długo|krótko|jedno|dwu|trzy|cztero)-"] + BASE_TOKENIZER_PREFIXES
|
||||
|
||||
_infixes = (
|
||||
LIST_ELLIPSES
|
||||
+ [CONCAT_ICONS]
|
||||
+ LIST_ICONS
|
||||
+ LIST_HYPHENS
|
||||
+ [
|
||||
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
|
||||
r"(?<=[0-9{al}])\.(?=[0-9{au}])".format(al=ALPHA, au=ALPHA_UPPER),
|
||||
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}])[:<>=\/](?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES),
|
||||
r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=_quotes),
|
||||
]
|
||||
)
|
||||
|
||||
_suffixes = (
|
||||
["''", "’’", r"\.", "…"]
|
||||
+ LIST_PUNCT
|
||||
+ LIST_QUOTES
|
||||
+ LIST_ICONS
|
||||
+ [
|
||||
r"(?<=[0-9])\+",
|
||||
r"(?<=°[FfCcKk])\.",
|
||||
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
|
||||
r"(?<=[0-9])(?:{u})".format(u=UNITS),
|
||||
r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
|
||||
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
|
||||
),
|
||||
r"(?<=[{au}])\.".format(au=ALPHA_UPPER),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
TOKENIZER_PREFIXES = _prefixes
|
||||
TOKENIZER_INFIXES = _infixes
|
||||
TOKENIZER_SUFFIXES = _suffixes
|
||||
|
|
|
@ -1,26 +0,0 @@
|
|||
# encoding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ._tokenizer_exceptions_list import PL_BASE_EXCEPTIONS
|
||||
from ...symbols import POS, ADV, NOUN, ORTH, LEMMA, ADJ
|
||||
|
||||
|
||||
_exc = {}
|
||||
|
||||
for exc_data in [
|
||||
{ORTH: "m.in.", LEMMA: "między innymi", POS: ADV},
|
||||
{ORTH: "inż.", LEMMA: "inżynier", POS: NOUN},
|
||||
{ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
|
||||
{ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
|
||||
{ORTH: "tj.", LEMMA: "to jest", POS: ADV},
|
||||
{ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ},
|
||||
]:
|
||||
_exc[exc_data[ORTH]] = [exc_data]
|
||||
|
||||
for orth in ["w.", "r."]:
|
||||
_exc[orth] = [{ORTH: orth}]
|
||||
|
||||
for orth in PL_BASE_EXCEPTIONS:
|
||||
_exc[orth] = [{ORTH: orth}]
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
|
@ -4,49 +4,15 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
|
||||
DOT_TESTS = [
|
||||
("tel.", ["tel."]),
|
||||
("np.", ["np."]),
|
||||
("godz. 21:37", ["godz.", "21:37"]),
|
||||
("inż.", ["inż."]),
|
||||
("gosp.-polit.", ["gosp.-polit."]),
|
||||
("ppoż", ["ppoż"]),
|
||||
("płn", ["płn"]),
|
||||
("ul.", ["ul."]),
|
||||
("jw.", ["jw."]),
|
||||
("itd.", ["itd."]),
|
||||
("cdn.", ["cdn."]),
|
||||
("itp.", ["itp."]),
|
||||
("10,- zł", ["10,-", "zł"]),
|
||||
("tel.", ["tel", "."]),
|
||||
("0 zł 99 gr", ["0", "zł", "99", "gr"]),
|
||||
("0,99 rub.", ["0,99", "rub."]),
|
||||
("dol.", ["dol."]),
|
||||
("1000 m n.p.m.", ["1000", "m", "n.p.m."]),
|
||||
("m.in.", ["m.in."]),
|
||||
("p.n.e.", ["p.n.e."]),
|
||||
("Sz.P.", ["Sz.P."]),
|
||||
("p.o.", ["p.o."]),
|
||||
("k.o.", ["k.o."]),
|
||||
("m.st.", ["m.st."]),
|
||||
("dra.", ["dra", "."]),
|
||||
("pp.", ["pp."]),
|
||||
("oo.", ["oo."]),
|
||||
]
|
||||
|
||||
HYPHEN_TESTS = [
|
||||
("5-fluoropentylo-3-pirydynyloindol", ["5-fluoropentylo-3-pirydynyloindol"]),
|
||||
("NESS-040C5", ["NESS-040C5"]),
|
||||
("JTE-7-31", ["JTE-7-31"]),
|
||||
("BAY-59-3074", ["BAY-59-3074"]),
|
||||
("BAY-38-7271", ["BAY-38-7271"]),
|
||||
("STS-135", ["STS-135"]),
|
||||
("5F-PB-22", ["5F-PB-22"]),
|
||||
("cztero-", ["cztero-"]),
|
||||
("jedno-", ["jedno-"]),
|
||||
("dwu-", ["dwu-"]),
|
||||
("trzy-", ["trzy-"]),
|
||||
("b-adoratorzy", ["b-adoratorzy"]),
|
||||
("2-3-4 drzewa", ["2-3-4", "drzewa"]),
|
||||
("b-drzewa", ["b-drzewa"]),
|
||||
]
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user