From b90e33469e1870b38bf70712cfd1b5a021d5fe24 Mon Sep 17 00:00:00 2001 From: thjbdvlt <109964512+thjbdvlt@users.noreply.github.com> Date: Tue, 10 Sep 2024 14:17:33 +0200 Subject: [PATCH] universe-package-quelquhui (#13514) [ci skip] Co-authored-by: Ines Montani --- website/meta/universe.json | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 46be50665..ec8887276 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -4552,6 +4552,26 @@ }, "category": ["standalone"] }, + { + "id": "quelquhui", + "title": "quelquhui", + "slogan": "Tokenizer for contemporary French", + "description": "A tokenizer for French that handles in-word parentheses like in _(b)rouille_, inclusive language (won't split _relecteur.rice.s_, but will split _mais.maintenant_), hyphens (split _peut-on_, or _pouvons-vous_ but not _tubulu-pimpant_), apostrophes (split _j'arrive_ or _j'arrivons_, but not _aujourd'hui_ or _r'garder_), emoticons, text-emoji (_:happy:_), URLs, emails and more.", + "github": "thjbdvlt/quelquhui", + "code_example": [ + "import spacy", + "import quelquhui", + "nlp = spacy.load('fr_core_news_lg')", + "nlp.tokenizer = quelquhui.Toquenizer(nlp.vocab)" + ], + "code_language": "python", + "author": "thjbdvlt", + "author_links": { + "github": "thjbdvlt" + }, + "category": ["pipeline"], + "tags": ["tokenizer", "french"] + }, { "id": "gliner-spacy", "title": "GLiNER spaCy Wrapper", @@ -4579,7 +4599,6 @@ "category": ["pipeline"], "tags": ["NER"] } - ], "categories": [