From 8fc2a5f13bdff33e3d6f7a3eff951a0276249602 Mon Sep 17 00:00:00 2001 From: Lise Brinck Date: Wed, 22 Feb 2023 14:55:26 +0100 Subject: [PATCH] create punctuation.py for swedish --- spacy/lang/sv/__init__.py | 5 +---- spacy/lang/sv/punctuation.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 4 deletions(-) create mode 100644 spacy/lang/sv/punctuation.py diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index 6963e8b79..28e5085a8 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -6,10 +6,7 @@ from .lex_attrs import LEX_ATTRS from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language, BaseDefaults from ...pipeline import Lemmatizer - - -# Punctuation stolen from Danish -from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES +from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES class SwedishDefaults(BaseDefaults): diff --git a/spacy/lang/sv/punctuation.py b/spacy/lang/sv/punctuation.py new file mode 100644 index 000000000..e050ab7aa --- /dev/null +++ b/spacy/lang/sv/punctuation.py @@ -0,0 +1,31 @@ +from ..char_classes import LIST_ELLIPSES, LIST_ICONS +from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER +from ..punctuation import TOKENIZER_SUFFIXES + + +_quotes = CONCAT_QUOTES.replace("'", "") + +_infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), + r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes), + r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), + ] +) + +_suffixes = [ + suffix + for suffix in TOKENIZER_SUFFIXES + if suffix not in ["'s", "'S", "’s", "’S", r"\'"] +] +_suffixes += [r"(?<=[^sSxXzZ])\'"] + + +TOKENIZER_INFIXES = _infixes +TOKENIZER_SUFFIXES = _suffixes