Mirror of https://github.com/explosion/spaCy.git

Merge pull request #4205 from adrianeboyd/feature/gold-train-orth-variants

Add train_docs() option to add orth variants

Commit 6b2ea883ed
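For context, a minimal sketch of how the new option could be invoked during training, assuming spaCy v2.x and hypothetical corpus paths (train.json / dev.json):

import spacy
from spacy.gold import GoldCorpus

# hypothetical paths to JSON-formatted training and development corpora
corpus = GoldCorpus("train.json", "dev.json")
nlp = spacy.blank("en")
# with orth_variant_level=0.2, roughly 20% of training paragraphs are
# rewritten with alternative quote/dash/ellipsis forms before tokenization
for doc, gold in corpus.train_docs(nlp, noise_level=0.0, orth_variant_level=0.2):
    pass  # feed each (doc, gold) pair into nlp.update(...) during training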
README.md

@@ -19,6 +19,8 @@ It's commercial open-source software, released under the MIT license.
 [![pypi Version](https://img.shields.io/pypi/v/spacy.svg?style=flat-square)](https://pypi.org/project/spacy/)
 [![conda Version](https://img.shields.io/conda/vn/conda-forge/spacy.svg?style=flat-square)](https://anaconda.org/conda-forge/spacy)
 [![Python wheels](https://img.shields.io/badge/wheels-%E2%9C%93-4c1.svg?longCache=true&style=flat-square&logo=python&logoColor=white)](https://github.com/explosion/wheelwright/releases)
+[![PyPi downloads](https://img.shields.io/pypi/dm/spacy?style=flat-square)](https://pypi.org/project/spacy/)
+[![Conda downloads](https://img.shields.io/conda/dn/conda-forge/spacy?style=flat-square)](https://anaconda.org/conda-forge/spacy)
 [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg?style=flat-square)](https://github.com/ambv/black)
 [![spaCy on Twitter](https://img.shields.io/twitter/follow/spacy_io.svg?style=social&label=Follow)](https://twitter.com/spacy_io)
spacy/gold.pyx (103 lines changed)
@@ -7,6 +7,7 @@ import random
 import numpy
 import tempfile
 import shutil
+import itertools
 from pathlib import Path
 import srsly
@@ -206,13 +207,14 @@ class GoldCorpus(object):
         return n

     def train_docs(self, nlp, gold_preproc=False, max_length=None,
-                   noise_level=0.0):
+                   noise_level=0.0, orth_variant_level=0.0):
         locs = list((self.tmp_dir / 'train').iterdir())
         random.shuffle(locs)
         train_tuples = self.read_tuples(locs, limit=self.limit)
         gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
                                         max_length=max_length,
                                         noise_level=noise_level,
+                                        orth_variant_level=orth_variant_level,
                                         make_projective=True)
         yield from gold_docs
@@ -226,27 +228,32 @@ class GoldCorpus(object):

     @classmethod
     def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
-                       noise_level=0.0, make_projective=False):
+                       noise_level=0.0, orth_variant_level=0.0, make_projective=False):
         for raw_text, paragraph_tuples in tuples:
             if gold_preproc:
                 raw_text = None
             else:
                 paragraph_tuples = merge_sents(paragraph_tuples)
-            docs = cls._make_docs(nlp, raw_text, paragraph_tuples, gold_preproc,
-                                  noise_level=noise_level)
+            docs, paragraph_tuples = cls._make_docs(nlp, raw_text,
+                paragraph_tuples, gold_preproc, noise_level=noise_level,
+                orth_variant_level=orth_variant_level)
             golds = cls._make_golds(docs, paragraph_tuples, make_projective)
             for doc, gold in zip(docs, golds):
                 if (not max_length) or len(doc) < max_length:
                     yield doc, gold

     @classmethod
-    def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0):
+    def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0, orth_variant_level=0.0):
         if raw_text is not None:
+            raw_text, paragraph_tuples = make_orth_variants(nlp, raw_text, paragraph_tuples, orth_variant_level=orth_variant_level)
             raw_text = add_noise(raw_text, noise_level)
-            return [nlp.make_doc(raw_text)]
+            return [nlp.make_doc(raw_text)], paragraph_tuples
         else:
+            docs = []
+            raw_text, paragraph_tuples = make_orth_variants(nlp, None, paragraph_tuples, orth_variant_level=orth_variant_level)
             return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
-                    for (sent_tuples, brackets) in paragraph_tuples]
+                    for (sent_tuples, brackets) in paragraph_tuples], paragraph_tuples

     @classmethod
     def _make_golds(cls, docs, paragraph_tuples, make_projective):
@@ -263,6 +270,88 @@ class GoldCorpus(object):
                 in zip(docs, paragraph_tuples)]


+def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
+    if random.random() >= orth_variant_level:
+        return raw, paragraph_tuples
+    ndsv = nlp.Defaults.single_orth_variants
+    ndpv = nlp.Defaults.paired_orth_variants
+    # modify words in paragraph_tuples
+    variant_paragraph_tuples = []
+    for sent_tuples, brackets in paragraph_tuples:
+        ids, words, tags, heads, labels, ner = sent_tuples
+        # single variants
+        punct_choices = [random.choice(x["variants"]) for x in ndsv]
+        for word_idx in range(len(words)):
+            for punct_idx in range(len(ndsv)):
+                if tags[word_idx] in ndsv[punct_idx]["tags"] \
+                        and words[word_idx] in ndsv[punct_idx]["variants"]:
+                    words[word_idx] = punct_choices[punct_idx]
+        # paired variants
+        punct_choices = [random.choice(x["variants"]) for x in ndpv]
+        for word_idx in range(len(words)):
+            for punct_idx in range(len(ndpv)):
+                if tags[word_idx] in ndpv[punct_idx]["tags"] \
+                        and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]):
+                    # backup option: random left vs. right from pair
+                    pair_idx = random.choice([0, 1])
+                    # best option: rely on paired POS tags like `` / ''
+                    if len(ndpv[punct_idx]["tags"]) == 2:
+                        pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx])
+                    # next best option: rely on position in variants
+                    # (may not be unambiguous, so order of variants matters)
+                    else:
+                        for pair in ndpv[punct_idx]["variants"]:
+                            if words[word_idx] in pair:
+                                pair_idx = pair.index(words[word_idx])
+                    words[word_idx] = punct_choices[punct_idx][pair_idx]
+
+        variant_paragraph_tuples.append(((ids, words, tags, heads, labels, ner), brackets))
+    # modify raw to match variant_paragraph_tuples
+    if raw is not None:
+        variants = []
+        for single_variants in ndsv:
+            variants.extend(single_variants["variants"])
+        for paired_variants in ndpv:
+            variants.extend(list(itertools.chain.from_iterable(paired_variants["variants"])))
+        # store variants in reverse length order to be able to prioritize
+        # longer matches (e.g., "---" before "--")
+        variants = sorted(variants, key=lambda x: len(x))
+        variants.reverse()
+        variant_raw = ""
+        raw_idx = 0
+        # add initial whitespace
+        while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
+            variant_raw += raw[raw_idx]
+            raw_idx += 1
+        for sent_tuples, brackets in variant_paragraph_tuples:
+            ids, words, tags, heads, labels, ner = sent_tuples
+            for word in words:
+                match_found = False
+                # add identical word
+                if word not in variants and raw[raw_idx:].startswith(word):
+                    variant_raw += word
+                    raw_idx += len(word)
+                    match_found = True
+                # add variant word
+                else:
+                    for variant in variants:
+                        if not match_found and \
+                                raw[raw_idx:].startswith(variant):
+                            raw_idx += len(variant)
+                            variant_raw += word
+                            match_found = True
+                # something went wrong, abort
+                # (add a warning message?)
+                if not match_found:
+                    return raw, paragraph_tuples
+                # add following whitespace
+                while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
+                    variant_raw += raw[raw_idx]
+                    raw_idx += 1
+        return variant_raw, variant_paragraph_tuples
+    return raw, variant_paragraph_tuples


 def add_noise(orig, noise_level):
     if random.random() >= noise_level:
         return orig
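To make the paired-variant branch above concrete, here is a small self-contained sketch (toy data and names, not spaCy internals) showing how a tagged quote pair is rewritten consistently:

import itertools
import random

# toy data shaped like the paired_orth_variants entries in this PR
ndpv = [{"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]}]
words = ["“", "Hello", "”"]
tags = ["``", "NNP", "''"]

# one random pair is drawn per variant group, then applied consistently
punct_choices = [random.choice(x["variants"]) for x in ndpv]
for word_idx in range(len(words)):
    for punct_idx in range(len(ndpv)):
        entry = ndpv[punct_idx]
        if tags[word_idx] in entry["tags"] \
                and words[word_idx] in itertools.chain.from_iterable(entry["variants"]):
            # the paired PTB tags `` / '' identify the opening vs. closing slot
            pair_idx = entry["tags"].index(tags[word_idx])
            words[word_idx] = punct_choices[punct_idx][pair_idx]

print(words)  # e.g. ['"', 'Hello', '"'] if the straight-quote pair was drawn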
spacy/lang/de/__init__.py

@@ -27,6 +27,10 @@ class GermanDefaults(Language.Defaults):
     stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
     resources = {"lemma_lookup": "lemma_lookup.json"}
+    single_orth_variants = [{"tags": ["$("], "variants": ["…", "..."]},
+                            {"tags": ["$("], "variants": ["-", "—", "–", "--", "---", "——"]}]
+    paired_orth_variants = [{"tags": ["$("], "variants": [("'", "'"), (",", "'"), ("‚", "‘")]},
+                            {"tags": ["$("], "variants": [("``", "''"), ('"', '"'), ("„", "“")]}]


 class German(Language):
spacy/lang/en/__init__.py

@@ -38,6 +38,10 @@ class EnglishDefaults(Language.Defaults):
         "lemma_index": "lemmatizer/lemma_index.json",
         "lemma_exc": "lemmatizer/lemma_exc.json",
     }
+    single_orth_variants = [{"tags": ["NFP"], "variants": ["…", "..."]},
+                            {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]}]
+    paired_orth_variants = [{"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]},
+                            {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]}]


 class English(Language):
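As an illustration of how the single-variant rule uses these tables, a standalone sketch with toy tokens, mirroring the loop in make_orth_variants above:

import random

# toy data mirroring EnglishDefaults.single_orth_variants above
ndsv = [{"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]}]
words = ["Wait", "--", "no"]
tags = ["VB", ":", "UH"]

# every matching token in the sentence receives the same drawn replacement
punct_choices = [random.choice(x["variants"]) for x in ndsv]
for word_idx in range(len(words)):
    for punct_idx in range(len(ndsv)):
        if tags[word_idx] in ndsv[punct_idx]["tags"] \
                and words[word_idx] in ndsv[punct_idx]["variants"]:
            words[word_idx] = punct_choices[punct_idx]

print(words)  # e.g. ['Wait', '—', 'no']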
spacy/language.py

@@ -109,6 +109,8 @@ class BaseDefaults(object):
     syntax_iterators = {}
     resources = {}
     writing_system = {"direction": "ltr", "has_case": True, "has_letters": True}
+    single_orth_variants = []
+    paired_orth_variants = []


 class Language(object):
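Because the base defaults are empty lists, languages without variant tables are unaffected. A sketch of how a custom subclass might opt in, assuming the v2-style Defaults mechanism (CustomEnglishDefaults and CustomEnglish are hypothetical names):

from spacy.lang.en import English, EnglishDefaults

class CustomEnglishDefaults(EnglishDefaults):
    # only vary ellipses; disable paired quote substitutions
    single_orth_variants = [{"tags": ["NFP"], "variants": ["…", "..."]}]
    paired_orth_variants = []

class CustomEnglish(English):
    Defaults = CustomEnglishDefaults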
website/docs/api/top-level.md

@@ -192,7 +192,7 @@ browser. Will run a simple web server.
 | `style` | unicode | Visualization style, `'dep'` or `'ent'`. | `'dep'` |
 | `page` | bool | Render markup as full HTML page. | `True` |
 | `minify` | bool | Minify HTML markup. | `False` |
-| `options` | dict | [Visualizer-specific options](#options), e.g. colors. | `{}` |
+| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` |
 | `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` |
 | `port` | int | Port to serve visualization. | `5000` |
 | `host` | unicode | Host to serve visualization. | `'0.0.0.0'` |

@@ -218,7 +218,7 @@ Render a dependency parse tree or named entity visualization.
 | `page` | bool | Render markup as full HTML page. | `False` |
 | `minify` | bool | Minify HTML markup. | `False` |
 | `jupyter` | bool | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None`. | `None` |
-| `options` | dict | [Visualizer-specific options](#options), e.g. colors. | `{}` |
+| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` |
 | `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` |
 | **RETURNS** | unicode | Rendered HTML markup. |
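For reference, a short usage sketch of the two calls documented above (assumes en_core_web_sm is installed):

import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying a U.K. startup.")

# render entity markup as a full HTML page; `options` holds the
# visualizer-specific settings linked in the tables above
html = displacy.render(doc, style="ent", options={"ents": ["ORG", "GPE"]}, page=True)

# or start a simple web server with the dependency visualizer
# displacy.serve(doc, style="dep", port=5000)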
@@ -9,6 +9,7 @@
     "en_core_web_lg",
     "en_vectors_web_lg",
     "en_pytt_bertbaseuncased_lg",
+    "en_pytt_robertabase_lg",
     "en_pytt_xlnetbasecased_lg"
 ],
 "example": "This is a sentence.",
website/meta/universe.json

@@ -1715,6 +1715,35 @@
             "\tprint(ent.text, ent.start_char, ent.end_char, ent.label_)"
         ],
         "author": "Stefan Daniel Dumitrescu, Andrei-Marius Avram"
+    },
+    {
+        "id": "num_fh",
+        "title": "Numeric Fused-Head",
+        "slogan": "Numeric Fused-Head Identification and Resolution in English",
+        "description": "This package provides a wrapper for the Numeric Fused-Head in English. It adds another information layer on numbers that refer to another entity which is not obvious from the syntactic tree.",
+        "github": "yanaiela/num_fh",
+        "pip": "num_fh",
+        "category": ["pipeline", "research"],
+        "code_example": [
+            "import spacy",
+            "from num_fh import NFH",
+            "nlp = spacy.load('en_core_web_sm')",
+            "nfh = NFH(nlp)",
+            "nlp.add_pipe(nfh, first=False)",
+            "doc = nlp(\"I told you two, that only one of them is the one who will get 2 or 3 icecreams\")",
+            "",
+            "assert doc[16]._.is_nfh == True",
+            "assert doc[18]._.is_nfh == False",
+            "assert doc[3]._.is_deter_nfh == True",
+            "assert doc[16]._.is_deter_nfh == False",
+            "assert len(doc._.nfh) == 4"
+        ],
+        "author": "Yanai Elazar",
+        "author_links": {
+            "github": "yanaiela",
+            "twitter": "yanaiela",
+            "website": "https://yanaiela.github.io"
+        }
+    }
 ],