This commit is contained in:
Donald Winkelman 2025-11-29 17:12:15 -05:00 committed by GitHub
commit ed6a6d3623
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1,5 +1,55 @@
{
"resources": [
{
"id": "MorphSeg",
"title": "MorphSeg",
"thumb": "https://raw.githubusercontent.com/TheWelcomer/MorphSeg/master/library/logo.png",
"url": "https://pypi.org/project/morphseg/",
"slogan": "Morpheme Segmentation for Several Languages using SpaCy",
"description": "[MorphSeg](https://github.com/TheWelcomer/MorphSeg) is a morpheme segmentation library and SpaCy pipeline which supports segmentation for 8 languages (english, spanish, russian, french, italian, czech, hungarian, and latin). The pretrained models are high-accuracy, small (~3M Params), and efficient (~500 words/second on a Macbook GPU) neural nets. The interface is designed to be simple, just use spacy as usual and add the morpheme_segmenter pipeline to get segmentations!\n- [Demo Website](https://huggingface.co/spaces/Morphological-Segmentation/Morpheme_Segmentation_Demo)\n- [Demo Colab Notebook](https://colab.research.google.com/drive/1alisdnbCQCRhvdT9DhMnRNRuDLIZLZho#scrollTo=QB6uAXxWoffA)\n- [GitHub](https://github.com/TheWelcomer/MorphSeg)\n- [PyPI Package](https://pypi.org/project/morphseg/)\n- [Hugging Face Repository](https://huggingface.co/MorphSeg)",
"github": "TheWelcomer/MorphSeg",
"pip": "morphseg",
"code_example": [
"import morphseg",
"import spacy",
"",
"# Load your existing spaCy model or a blank NLP object",
"nlp = spacy.blank('en')",
"",
"# Add the morpheme segmenter to the pipeline",
"nlp.add_pipe('morpheme_segmenter')",
"",
"# Process the text",
"doc = nlp('The unbelievably disagreeable preprocessor unsuccessfully reprocessed the unquestionably irreversible decontextualization')",
"",
"# Access morphemes for each token",
"for token in doc:",
" print(f'{token.text}: {token._.morphemes}')",
"",
"# Access morphemes for spans",
"span = doc[1:3]",
"print(f'Span morphemes: {span._.morphemes}')",
"",
"# Access morphemes for the entire document",
"print(f'All morphemes: {doc._.morphemes}') # [['the'], ['un', 'believe', 'able', 'ly'], ['dis', 'agree', 'able'], ['pre', 'process', 'or'], ['un', 'success', 'ful', 'ly'], ['re', 'process', 'ed'], ['the'], ['un', 'question', 'able', 'ly'], ['in', 'reverse', 'ible'], ['decontextual', 'ization']]"
],
"code_language": "python",
"author": "Donald Winkelman",
"author_links": {
"github": "TheWelcomer",
"website": "https://dwink.dev"
},
"category": [
"pipeline",
"standalone"
],
"tags": [
"morphology",
"segmentation",
"linguistics",
"multilingual"
]
},
{
"id": "TeNs",
"title": "Temporal Expressions Normalization spaCy",