From 3e8bc1272f95c89e0aa9e5a19f51e286a7934ffa Mon Sep 17 00:00:00 2001
From: Jacobo Myerston <43222279+jmyerston@users.noreply.github.com>
Date: Tue, 27 Sep 2022 02:38:56 -0700
Subject: [PATCH] add punctuation to grc (#11426)

* add punctuation to grc

Add support for the special editorial punctuation that is common in Ancient Greek texts. Such texts, in both digital and print form, have for the most part been edited by scholars, and restorations and emendations are normally marked with special characters that the tokenizer needs to handle properly.

* add unit tests

* simplify regex

* move generic quotes to char classes

* rename unit test

* fix regex

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

Co-authored-by: svlandeg <svlandeg@github.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/lang/char_classes.py             |  2 +-
 spacy/lang/grc/__init__.py             |  4 +++
 spacy/lang/grc/punctuation.py          | 46 ++++++++++++++++++++++++++
 spacy/tests/lang/grc/test_tokenizer.py | 18 ++++++++++
 4 files changed, 69 insertions(+), 1 deletion(-)
 create mode 100644 spacy/lang/grc/punctuation.py
 create mode 100644 spacy/tests/lang/grc/test_tokenizer.py

diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py
index 1d204c46c..37c58c85f 100644
--- a/spacy/lang/char_classes.py
+++ b/spacy/lang/char_classes.py
@@ -280,7 +280,7 @@ _currency = (
 _punct = (
     r"… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ？ ！ ， 、 ； ： ～ · । ، ۔ ؛ ٪"
 )
-_quotes = r'\' " ” “ ` ‘ ´ ’ ‚ , „ » « 「 」 『 』 （ ） 〔 〕 【 】 《 》 〈 〉'
+_quotes = r'\' " ” “ ` ‘ ´ ’ ‚ , „ » « 「 」 『 』 （ ） 〔 〕 【 】 《 》 〈 〉 〈 〉  ⟦ ⟧'  # ⟦ ⟧ (U+27E6/U+27E7) are exercised by the grc tokenizer tests
 _hyphens = "- – — -- --- —— ~"
 
 # Various symbols like dingbats, but also emoji
diff --git a/spacy/lang/grc/__init__.py b/spacy/lang/grc/__init__.py
index e83f0c5a5..019b3802e 100644
--- a/spacy/lang/grc/__init__.py
+++ b/spacy/lang/grc/__init__.py
@@ -1,11 +1,15 @@
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from ...language import Language, BaseDefaults
 
 
 class AncientGreekDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    prefixes = TOKENIZER_PREFIXES  # prefix/suffix/infix rules are imported from .punctuation
+    suffixes = TOKENIZER_SUFFIXES
+    infixes = TOKENIZER_INFIXES
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
 
diff --git a/spacy/lang/grc/punctuation.py b/spacy/lang/grc/punctuation.py
new file mode 100644
index 000000000..8f3589e9a
--- /dev/null
+++ b/spacy/lang/grc/punctuation.py
@@ -0,0 +1,46 @@
+from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
+from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
+from ..char_classes import CONCAT_QUOTES
+
+_prefixes = (
+    [  # marks added for grc, listed ahead of the shared char_classes lists
+        "†",  # U+2020 DAGGER
+        "⸏",  # U+2E0F PARAGRAPHOS
+    ]
+    + LIST_PUNCT
+    + LIST_ELLIPSES
+    + LIST_QUOTES
+    + LIST_CURRENCY
+    + LIST_ICONS
+)
+
+_suffixes = (
+    LIST_PUNCT
+    + LIST_ELLIPSES
+    + LIST_QUOTES
+    + LIST_ICONS
+    + [  # marks added for grc, appended after the shared char_classes lists
+        "†",  # U+2020 DAGGER
+        "⸎",  # U+2E0E EDITORIAL CORONIS
+        r"(?<=[\u1F00-\u1FFF\u0370-\u03FF])[\-\.⸏]",  # "-", "." or "⸏" directly after a character in U+0370-03FF or U+1F00-1FFF
+    ]
+)
+
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])[+\-\*^](?=[0-9-])",  # + - * ^ after a digit, before a digit or "-"
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(  # "." between the al/q and au/q character classes
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),  # "," between two ALPHA characters
+        r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),  # a HYPHENS alternative after ALPHA or a digit, before ALPHA
+        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),  # one of : < > = / after ALPHA or a digit, before ALPHA
+        r"(?<=[\u1F00-\u1FFF\u0370-\u03FF])—",  # em dash directly after a character in U+0370-03FF or U+1F00-1FFF
+    ]
+)
+
+TOKENIZER_PREFIXES = _prefixes  # these three names are imported by spacy/lang/grc/__init__.py
+TOKENIZER_SUFFIXES = _suffixes
+TOKENIZER_INFIXES = _infixes
diff --git a/spacy/tests/lang/grc/test_tokenizer.py b/spacy/tests/lang/grc/test_tokenizer.py
new file mode 100644
index 000000000..3df5b546b
--- /dev/null
+++ b/spacy/tests/lang/grc/test_tokenizer.py
@@ -0,0 +1,18 @@
+import pytest
+
+
+# fmt: off
+GRC_TOKEN_EXCEPTION_TESTS = [  # (text, expected non-space token texts) pairs; the editorial marks are expected as separate tokens
+    ("τὸ 〈τῆς〉 φιλοσοφίας ἔργον ἔνιοί φασιν ἀπὸ ⟦βαρβάρων⟧ ἄρξαι.", ["τὸ", "〈", "τῆς", "〉", "φιλοσοφίας", "ἔργον", "ἔνιοί", "φασιν", "ἀπὸ", "⟦", "βαρβάρων", "⟧", "ἄρξαι", "."]),
+    ("τὴν δὲ τῶν Αἰγυπτίων φιλοσοφίαν εἶναι τοιαύτην περί τε †θεῶν† καὶ ὑπὲρ δικαιοσύνης.", ["τὴν", "δὲ", "τῶν", "Αἰγυπτίων", "φιλοσοφίαν", "εἶναι", "τοιαύτην", "περί", "τε", "†", "θεῶν", "†", "καὶ", "ὑπὲρ", "δικαιοσύνης", "."]),
+    ("⸏πόσις δ' Ἐρεχθεύς ἐστί μοι σεσωσμένος⸏", ["⸏", "πόσις", "δ'", "Ἐρεχθεύς", "ἐστί", "μοι", "σεσωσμένος", "⸏"]),
+    ("⸏ὔπνον ἴδωμεν⸎", ["⸏", "ὔπνον", "ἴδωμεν", "⸎"]),
+]
+# fmt: on
+
+
+@pytest.mark.parametrize("text,expected_tokens", GRC_TOKEN_EXCEPTION_TESTS)
+def test_grc_tokenizer(grc_tokenizer, text, expected_tokens):
+    """Tokenize `text` and compare the non-space token texts with `expected_tokens`."""
+    observed = [tok.text for tok in grc_tokenizer(text) if not tok.is_space]
+    assert expected_tokens == observed