Mirror of https://github.com/explosion/spaCy.git
Synced 2024-11-10 19:57:17 +03:00

Bugfix/swedish tokenizer (#12315)

* Add a unit test for explosion#12311
* Create punctuation.py for Swedish
* Remove ":" from the infixes in the Swedish punctuation.py
* Allow ":" as an infix if the succeeding character is uppercase

Parent: 4539fbae17
Commit: e2de188cf1
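The intended effect of the fix, as a minimal sketch (assuming a spaCy build that includes this commit; spacy.blank("sv") builds a Swedish pipeline using these tokenizer rules):

    # Swedish abbreviations written with a colon now stay single tokens.
    import spacy

    nlp = spacy.blank("sv")
    print([t.text for t in nlp("EU:s")])   # ['EU:s'] - colon kept inside the token
    print([t.text for t in nlp("Maj:t")])  # ['Maj:t']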
spacy/lang/sv/__init__.py

@@ -6,10 +6,7 @@ from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language, BaseDefaults
 from ...pipeline import Lemmatizer
-
-# Punctuation stolen from Danish
-from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
-
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 
 
 class SwedishDefaults(BaseDefaults):
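For context, SwedishDefaults is where the imported rules take effect. A sketch of the wiring, following spaCy's usual language-defaults pattern (the class body itself is not part of this hunk, so this is an illustration, not the diff):

    from spacy.language import BaseDefaults
    from spacy.lang.sv.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES

    class SwedishDefaults(BaseDefaults):
        # Expose the language-specific rules to the tokenizer.
        infixes = TOKENIZER_INFIXES
        suffixes = TOKENIZER_SUFFIXES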
spacy/lang/sv/punctuation.py (new file, 33 lines)
@@ -0,0 +1,33 @@
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..punctuation import TOKENIZER_SUFFIXES
+
+
+_quotes = CONCAT_QUOTES.replace("'", "")
+
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
+        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])[<>=](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}]):(?=[{a}])".format(a=ALPHA_UPPER),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
+        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9]):(?=[{a}])".format(a=ALPHA_UPPER),
+    ]
+)
+
+_suffixes = [
+    suffix
+    for suffix in TOKENIZER_SUFFIXES
+    if suffix not in ["'s", "'S", "’s", "’S", r"\'"]
+]
+_suffixes += [r"(?<=[^sSxXzZ])\'"]
+
+
+TOKENIZER_INFIXES = _infixes
+TOKENIZER_SUFFIXES = _suffixes
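The two colon rules are the heart of the fix: a colon is treated as an infix only when the following character is uppercase, so genitive and ordinal abbreviations stay intact. A standalone sketch of the second rule, substituting a plain ASCII class for spaCy's Unicode-aware ALPHA_UPPER (an assumption made for brevity):

    import re

    ALPHA_UPPER = "A-Z"  # stand-in for spaCy's full Unicode uppercase class
    colon_infix = re.compile(r"(?<=[{a}0-9]):(?=[{a}])".format(a=ALPHA_UPPER))

    print(bool(colon_infix.search("EU:s")))   # False -> "EU:s" is left as one token
    print(bool(colon_infix.search("99:e")))   # False -> ordinal "99:e" stays whole
    print(bool(colon_infix.search("10:00")))  # False -> digit after the colon
    print(bool(colon_infix.search("AB:CD")))  # True  -> colon before a capital still splits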
Swedish tokenizer tests (file path not shown in this view)

@@ -32,3 +32,10 @@ def test_tokenizer_splits_comma_infix(sv_tokenizer, text):
 def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text):
     tokens = sv_tokenizer(text)
     assert len(tokens) == 3
+
+
+@pytest.mark.issue(12311)
+@pytest.mark.parametrize("text", ["99:e", "c:a", "EU:s", "Maj:t"])
+def test_sv_tokenizer_handles_colon(sv_tokenizer, text):
+    tokens = sv_tokenizer(text)
+    assert len(tokens) == 1
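The regression test relies on the sv_tokenizer fixture from spaCy's test suite. An equivalent standalone check, constructing the tokenizer directly (an assumption for illustration; the fixture builds essentially the same object):

    import spacy

    sv_tokenizer = spacy.blank("sv").tokenizer
    for text in ["99:e", "c:a", "EU:s", "Maj:t"]:
        tokens = sv_tokenizer(text)
        assert len(tokens) == 1, text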