From a71ec2a5c5dd2377e8e331a6fd3f42e819c310f8 Mon Sep 17 00:00:00 2001 From: Donald Winkelman Date: Mon, 24 Nov 2025 12:08:22 -0500 Subject: [PATCH] Adding MorphSeg extension to SpaCy Universe --- website/meta/universe.json | 50 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 1f55d9616..7bb31abc1 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,5 +1,55 @@ { "resources": [ + { + "id": "MorphSeg", + "title": "MorphSeg", + "thumb": "https://raw.githubusercontent.com/TheWelcomer/MorphSeg/master/library/logo.png", + "url": "https://pypi.org/project/morphseg/", + "slogan": "Morpheme Segmentation for Several Languages using spaCy", + "description": "[MorphSeg](https://github.com/TheWelcomer/MorphSeg) is a morpheme segmentation library and spaCy pipeline component that supports segmentation for 8 languages (English, Spanish, Russian, French, Italian, Czech, Hungarian, and Latin). The pretrained models are high-accuracy, small (~3M parameters), and efficient (~500 words/second on a MacBook GPU) neural nets. 
The interface is designed to be simple: just use spaCy as usual and add the morpheme_segmenter component to your pipeline to get segmentations!\n- [Demo Website](https://huggingface.co/spaces/Morphological-Segmentation/Morpheme_Segmentation_Demo)\n- [Demo Colab Notebook](https://colab.research.google.com/drive/1alisdnbCQCRhvdT9DhMnRNRuDLIZLZho#scrollTo=QB6uAXxWoffA)\n- [GitHub](https://github.com/TheWelcomer/MorphSeg)\n- [PyPI Package](https://pypi.org/project/morphseg/)\n- [Hugging Face Repository](https://huggingface.co/MorphSeg)", + "github": "TheWelcomer/MorphSeg", + "pip": "morphseg", + "code_example": [ + "import morphseg", + "import spacy", + "", + "# Load your existing spaCy model or a blank NLP object", + "nlp = spacy.blank('en')", + "", + "# Add the morpheme segmenter to the pipeline", + "nlp.add_pipe('morpheme_segmenter')", + "", + "# Process the text", + "doc = nlp('The unbelievably disagreeable preprocessor unsuccessfully reprocessed the unquestionably irreversible decontextualization')", + "", + "# Access morphemes for each token", + "for token in doc:", + " print(f'{token.text}: {token._.morphemes}')", + "", + "# Access morphemes for spans", + "span = doc[1:3]", + "print(f'Span morphemes: {span._.morphemes}')", + "", + "# Access morphemes for the entire document", + "print(f'All morphemes: {doc._.morphemes}') # [['the'], ['un', 'believe', 'able', 'ly'], ['dis', 'agree', 'able'], ['pre', 'process', 'or'], ['un', 'success', 'ful', 'ly'], ['re', 'process', 'ed'], ['the'], ['un', 'question', 'able', 'ly'], ['in', 'reverse', 'ible'], ['decontextual', 'ization']]" + ], + "code_language": "python", + "author": "Donald Winkelman", + "author_links": { + "github": "TheWelcomer", + "website": "https://dwink.dev" + }, + "category": [ + "pipeline", + "standalone" + ], + "tags": [ + "morphology", + "segmentation", + "linguistics", + "multilingual" + ] + }, { "id": "TeNs", "title": "Temporal Expressions Normalization spaCy",