Add TeNs plugin (#13800)

Co-authored-by: Ilie Cristian Dorobat <idorobat@cisco.com>
2025-09-19 10:32:40 +03:00 · 2025-05-27 02:21:07 +03:00 · 2025-05-27 02:21:07 +03:00 · bec546cec0
commit bec546cec0
parent 46613e27cf
1 changed files with 69 additions and 0 deletions
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@ -1,5 +1,74 @@
 {
    "resources": [
        {
            "id": "TeNs",
            "title": "Temporal Expressions Normalization spaCy",
            "thumb": "https://github-production-user-asset-6210df.s3.amazonaws.com/40547052/433595900-fae3c9d9-7181-4d8b-8b49-e6dc4fca930b.png?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20250414%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250414T235545Z&X-Amz-Expires=300&X-Amz-Signature=e21d3c06300ceb15fa1dadd7cb60081cc9f1b35e5a7bfd07f6e8b90dd7fad9d0&X-Amz-SignedHeaders=host",
            "url": "https://pypi.org/project/temporal-normalization-spacy/",
            "slogan": "A temporal expression normalization plugin for Romanian using rule-based methods and DBpedia mappings.",
            "description": "**[Temporal Expressions Normalization spaCy (TeNs)](https://github.com/iliedorobat/timespan-normalization-spacy)** is a powerful pipeline component for spaCy that seamlessly identifies and parses date entities in text. It leverages the **[Temporal Expressions Normalization Framework]( https://github.com/iliedorobat/timespan-normalization)** to recognize a wide variety of date formats using an extensive set of regular expressions (RegEx), ensuring robust and adaptable date extraction across diverse textual sources.\n\nUnlike conventional solutions that primarily focus on well-structured date formats, TeNs excels in handling real-world text by **identifying** not only standard date representations but also **abbreviated, informal, or even misspelled temporal expressions.** This makes it particularly effective for processing noisy or unstructured data, such as historical records, user-generated content, and scanned documents with OCR inaccuracies.",
            "github": "iliedorobat/timespan-normalization-spacy",
            "pip": "temporal-normalization-spacy",
            "code_example": [
                "import subprocess",
                "",
                "import spacy",
                "",
                "from temporal_normalization.commons.print_utils import console",
                "from temporal_normalization.index import create_normalized_component, TemporalNormalization  # noqa: F401",
                "",
                "",
                "try:",
                "    # Load the spaCy model if it has already been downloaded",
                "    nlp = spacy.load('ro_core_news_sm')",
                "except OSError:",
                "    console.warning(f'Started downloading ro_core_news_sm...')",
                "    # Download the Romanian model if it wasn't already downloaded",
                "    subprocess.run(['python', '-m', 'spacy', 'download', 'ro_core_news_sm'])",
                "    # Load the spaCy model",
                "    nlp = spacy.load('ro_core_news_sm')",
                "",
                "# Add 'temporal_normalization' component to the spaCy pipeline",
                "nlp.add_pipe('temporal_normalization', last=True)",
                "doc = nlp('Sec al II-lea a.ch. - I d.ch reprezintă o perioadă de mari schimbări.')",
                "",
                "# Display information about the identified and normalized dates in the text.",
                "for entity in doc.ents:",
                "    edges = entity._.time_series.edges",
                "",
                "    print('Start Edge:')",
                "    print(edges.start.serialize('\\t'))",
                "    print()",
                "",
                "    print('End Edge:')",
                "    print(edges.end.serialize('\\t'))",
                "    print()",
                "",
                "    print('Periods:')",
                "    for period in entity._.time_series.periods:",
                "        print(period.serialize('\\t'))",
                "        print()",
                "    print('---------------------')"
            ],
            "code_language": "python",
            "author": "Ilie Cristian Dorobat",
            "author_links": {
                "github": "iliedorobat",
                "website": "https://iliedorobat.ro/"
            },
            "category": [
                "pipeline",
                "standalone"
            ],
            "tags": [
                "temporal",
                "normalization",
                "date",
                "romanian",
                "temporal-expression",
                "dbpedia"
            ]
        },
        {
            "id": "spacy-vscode",
            "title": "spaCy Visual Studio Code Extension",