add resource 'presque' to universe.json

2026-01-16 13:29:05 +03:00 · 2024-05-31 18:25:47 +02:00 · 2024-05-31 18:25:47 +02:00 · cb1f9457b4
commit cb1f9457b4
parent 3fa464b8c7
1 changed files with 17 additions and 7 deletions
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -4519,19 +4519,29 @@
            "category": ["standalone"]
        },
        {
-            "id": "quelquhui",
-            "title": "quelquhui",
-            "slogan": "tokenizer for contemporary french",
-            "description": "A tokenizer for french that handles inword parentheses like in _(b)rouille_, inclusive language (won't split _relecteur.rice.s_,but will split _mais.maintenant_), hyphens (split _peut-on_, or _pouvons-vous_ but not _tubulu-pimpant_), apostrophes (split _j'arrive_ or _j'arrivons_, but not _aujourd'hui_ or _r'garder_), emoticons, text-emoji (_:happy:_), urls, mails, ...",
-            "github": "thjbdvlt/quelquhui",
-            "code_example": ["import spacy", "import quelquhui", "nlp = spacy.load('fr_core_news_lg')", "nlp.tokenizer = quelquhui.Toquenizer(nlp.vocab)"],
+            "id": "presque",
+            "title": "presque",
+            "slogan": "normalizer for contemporary french.",
+            "description": "normalizer for french with focus on online and informal communication, _peùUUUt-èTRE_ becomes _peut-être_, _voilaaaa_ becomes _voilà_. it also harmonizes inclusive language (the user can choose how): by default, _auteur-rice-s-x et relecteur.xrices_ becomes _auteur·ricexs et relecteur·ricexs_.",
+            "github": "thjbdvlt/presque",
+            "code_example": [
+                "import spacy",
+                "import presque",
+                "",
+                "@spacy.Language.factory('presque_normalizer')",
+                "def create_presque_normalizer(nlp, name='presque_normalizer'):",
+                "    return presque.Normalizer(nlp=nlp)",
+                "",
+                "nlp = spacy.load('fr_core_news_lg')",
+                "nlp.add_pipe('presque_normalizer', first=True)"
+            ],
            "code_language": "python",
            "author": "thjbdvlt",
            "author_links": {
                "github": "thjbdvlt"
            },
            "category": ["pipeline"],
-            "tags": ["tokenizer", "french"]
+            "tags": ["normalizer", "french"]
        }
    ],