Merge pull request #4205 from adrianeboyd/feature/gold-train-orth-variants

Add train_docs() option to add orth variants
This commit is contained in:
Matthew Honnibal 2019-08-28 16:54:06 +02:00 committed by GitHub
commit 6b2ea883ed
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 140 additions and 9 deletions

View File

@ -19,6 +19,8 @@ It's commercial open-source software, released under the MIT license.
[![pypi Version](https://img.shields.io/pypi/v/spacy.svg?style=flat-square)](https://pypi.org/project/spacy/) [![pypi Version](https://img.shields.io/pypi/v/spacy.svg?style=flat-square)](https://pypi.org/project/spacy/)
[![conda Version](https://img.shields.io/conda/vn/conda-forge/spacy.svg?style=flat-square)](https://anaconda.org/conda-forge/spacy) [![conda Version](https://img.shields.io/conda/vn/conda-forge/spacy.svg?style=flat-square)](https://anaconda.org/conda-forge/spacy)
[![Python wheels](https://img.shields.io/badge/wheels-%E2%9C%93-4c1.svg?longCache=true&style=flat-square&logo=python&logoColor=white)](https://github.com/explosion/wheelwright/releases) [![Python wheels](https://img.shields.io/badge/wheels-%E2%9C%93-4c1.svg?longCache=true&style=flat-square&logo=python&logoColor=white)](https://github.com/explosion/wheelwright/releases)
[![PyPi downloads](https://img.shields.io/pypi/dm/spacy?style=flat-square)](https://pypi.org/project/spacy/)
[![Conda downloads](https://img.shields.io/conda/dn/conda-forge/spacy?style=flat-square)](https://anaconda.org/conda-forge/spacy)
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg?style=flat-square)](https://github.com/ambv/black) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg?style=flat-square)](https://github.com/ambv/black)
[![spaCy on Twitter](https://img.shields.io/twitter/follow/spacy_io.svg?style=social&label=Follow)](https://twitter.com/spacy_io) [![spaCy on Twitter](https://img.shields.io/twitter/follow/spacy_io.svg?style=social&label=Follow)](https://twitter.com/spacy_io)

View File

@ -7,6 +7,7 @@ import random
import numpy import numpy
import tempfile import tempfile
import shutil import shutil
import itertools
from pathlib import Path from pathlib import Path
import srsly import srsly
@ -206,13 +207,14 @@ class GoldCorpus(object):
return n return n
def train_docs(self, nlp, gold_preproc=False, max_length=None, def train_docs(self, nlp, gold_preproc=False, max_length=None,
noise_level=0.0): noise_level=0.0, orth_variant_level=0.0):
locs = list((self.tmp_dir / 'train').iterdir()) locs = list((self.tmp_dir / 'train').iterdir())
random.shuffle(locs) random.shuffle(locs)
train_tuples = self.read_tuples(locs, limit=self.limit) train_tuples = self.read_tuples(locs, limit=self.limit)
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc, gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
max_length=max_length, max_length=max_length,
noise_level=noise_level, noise_level=noise_level,
orth_variant_level=orth_variant_level,
make_projective=True) make_projective=True)
yield from gold_docs yield from gold_docs
@ -226,27 +228,32 @@ class GoldCorpus(object):
@classmethod @classmethod
def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None, def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
noise_level=0.0, make_projective=False): noise_level=0.0, orth_variant_level=0.0, make_projective=False):
for raw_text, paragraph_tuples in tuples: for raw_text, paragraph_tuples in tuples:
if gold_preproc: if gold_preproc:
raw_text = None raw_text = None
else: else:
paragraph_tuples = merge_sents(paragraph_tuples) paragraph_tuples = merge_sents(paragraph_tuples)
docs = cls._make_docs(nlp, raw_text, paragraph_tuples, gold_preproc, docs, paragraph_tuples = cls._make_docs(nlp, raw_text,
noise_level=noise_level) paragraph_tuples, gold_preproc, noise_level=noise_level,
orth_variant_level=orth_variant_level)
golds = cls._make_golds(docs, paragraph_tuples, make_projective) golds = cls._make_golds(docs, paragraph_tuples, make_projective)
for doc, gold in zip(docs, golds): for doc, gold in zip(docs, golds):
if (not max_length) or len(doc) < max_length: if (not max_length) or len(doc) < max_length:
yield doc, gold yield doc, gold
@classmethod @classmethod
def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0): def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0, orth_variant_level=0.0):
if raw_text is not None: if raw_text is not None:
raw_text, paragraph_tuples = make_orth_variants(nlp, raw_text, paragraph_tuples, orth_variant_level=orth_variant_level)
raw_text = add_noise(raw_text, noise_level) raw_text = add_noise(raw_text, noise_level)
return [nlp.make_doc(raw_text)] return [nlp.make_doc(raw_text)], paragraph_tuples
else: else:
docs = []
raw_text, paragraph_tuples = make_orth_variants(nlp, None, paragraph_tuples, orth_variant_level=orth_variant_level)
return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level)) return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
for (sent_tuples, brackets) in paragraph_tuples] for (sent_tuples, brackets) in paragraph_tuples], paragraph_tuples
@classmethod @classmethod
def _make_golds(cls, docs, paragraph_tuples, make_projective): def _make_golds(cls, docs, paragraph_tuples, make_projective):
@ -263,6 +270,88 @@ class GoldCorpus(object):
in zip(docs, paragraph_tuples)] in zip(docs, paragraph_tuples)]
def _whitespace_end(text, start):
    """Return the index of the first non-whitespace character at or after
    `start` in `text` (or `len(text)` if only whitespace remains)."""
    end = start
    while end < len(text) and text[end].isspace():
        end += 1
    return end


def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
    """Randomly replace tokens with orthographic variants.

    With probability `orth_variant_level`, tokens whose tag and text match an
    entry of `nlp.Defaults.single_orth_variants` or
    `nlp.Defaults.paired_orth_variants` are replaced by a randomly chosen
    variant from the same entry. One choice per entry is drawn per sentence.

    nlp: Object exposing `Defaults.single_orth_variants` (list of dicts with
        "tags" and "variants" keys, variants being strings) and
        `Defaults.paired_orth_variants` (same, variants being 2-item pairs).
    raw: The raw text the tokens were taken from, or None.
    paragraph_tuples: List of `(sent_tuples, brackets)` where `sent_tuples` is
        `(ids, words, tags, heads, labels, ner)`.
    orth_variant_level: Probability in [0, 1] that variants are applied.
    RETURNS: `(raw, paragraph_tuples)`. The inputs are returned untouched when
        no variants are applied or when `raw` cannot be aligned with the
        modified tokens; otherwise the modified text and a new list of tuples.
        The caller's `words` lists are never modified in place.
    """
    if random.random() >= orth_variant_level:
        return raw, paragraph_tuples
    ndsv = nlp.Defaults.single_orth_variants
    ndpv = nlp.Defaults.paired_orth_variants
    # Flatten each paired entry once: [(l1, r1), (l2, r2)] -> [l1, r1, l2, r2]
    paired_flat = [list(itertools.chain.from_iterable(entry["variants"]))
                   for entry in ndpv]

    # modify words in (a copy of) paragraph_tuples
    variant_paragraph_tuples = []
    for sent_tuples, brackets in paragraph_tuples:
        ids, words, tags, heads, labels, ner = sent_tuples
        # Work on a copy so the caller's data stays intact; this matters when
        # we have to abort below and hand back the original tuples.
        words = list(words)
        # single variants
        punct_choices = [random.choice(entry["variants"]) for entry in ndsv]
        for word_idx in range(len(words)):
            tag = tags[word_idx]
            for entry, choice in zip(ndsv, punct_choices):
                if tag in entry["tags"] \
                        and words[word_idx] in entry["variants"]:
                    words[word_idx] = choice
        # paired variants
        punct_choices = [random.choice(entry["variants"]) for entry in ndpv]
        for word_idx in range(len(words)):
            tag = tags[word_idx]
            for entry, flat, choice in zip(ndpv, paired_flat, punct_choices):
                if tag not in entry["tags"] or words[word_idx] not in flat:
                    continue
                # backup option: random left vs. right from pair
                # (always drawn, so the random stream does not depend on
                # which of the branches below is taken)
                pair_idx = random.choice([0, 1])
                # best option: rely on paired POS tags like `` / ''
                if len(entry["tags"]) == 2:
                    pair_idx = entry["tags"].index(tag)
                # next best option: rely on position in variants
                # (may not be unambiguous, so order of variants matters)
                else:
                    for pair in entry["variants"]:
                        if words[word_idx] in pair:
                            pair_idx = pair.index(words[word_idx])
                words[word_idx] = choice[pair_idx]
        variant_paragraph_tuples.append(
            ((ids, words, tags, heads, labels, ner), brackets))

    if raw is None:
        return raw, variant_paragraph_tuples

    # modify raw to match variant_paragraph_tuples
    variants = []
    for entry in ndsv:
        variants.extend(entry["variants"])
    for flat in paired_flat:
        variants.extend(flat)
    # longest first, to prioritize longer matches (e.g., "---" before "--")
    variants = sorted(variants, key=len, reverse=True)
    variant_set = set(variants)

    # initial whitespace is copied through unchanged
    raw_idx = _whitespace_end(raw, 0)
    pieces = [raw[:raw_idx]]
    for sent_tuples, brackets in variant_paragraph_tuples:
        for word in sent_tuples[1]:
            if word not in variant_set and raw.startswith(word, raw_idx):
                # identical word
                raw_idx += len(word)
            else:
                # variant word: find which variant the raw text used here.
                # Empty variants are ignored, they would match anywhere
                # without consuming any text.
                matched = None
                for variant in variants:
                    if variant and raw.startswith(variant, raw_idx):
                        matched = variant
                        break
                if matched is None:
                    # raw and tokens cannot be aligned: give up and return
                    # the unmodified inputs
                    return raw, paragraph_tuples
                raw_idx += len(matched)
            pieces.append(word)
            # following whitespace is copied through unchanged
            ws_end = _whitespace_end(raw, raw_idx)
            pieces.append(raw[raw_idx:ws_end])
            raw_idx = ws_end
    return "".join(pieces), variant_paragraph_tuples
def add_noise(orig, noise_level): def add_noise(orig, noise_level):
if random.random() >= noise_level: if random.random() >= noise_level:
return orig return orig

View File

@ -27,6 +27,10 @@ class GermanDefaults(Language.Defaults):
stop_words = STOP_WORDS stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS syntax_iterators = SYNTAX_ITERATORS
resources = {"lemma_lookup": "lemma_lookup.json"} resources = {"lemma_lookup": "lemma_lookup.json"}
single_orth_variants = [{"tags": ["$("], "variants": ["", "..."]},
{"tags": ["$("], "variants": ["-", "", "", "--", "---", "——"]}]
paired_orth_variants = [{"tags": ["$("], "variants": [("'", "'"), (",", "'"), ("", "")]},
{"tags": ["$("], "variants": [("``", "''"), ('"', '"'), ("", "")]}]
class German(Language): class German(Language):

View File

@ -38,6 +38,10 @@ class EnglishDefaults(Language.Defaults):
"lemma_index": "lemmatizer/lemma_index.json", "lemma_index": "lemmatizer/lemma_index.json",
"lemma_exc": "lemmatizer/lemma_exc.json", "lemma_exc": "lemmatizer/lemma_exc.json",
} }
single_orth_variants = [{"tags": ["NFP"], "variants": ["", "..."]},
{"tags": [":"], "variants": ["-", "", "", "--", "---", "——"]}]
paired_orth_variants = [{"tags": ["``", "''"], "variants": [("'", "'"), ("", "")]},
{"tags": ["``", "''"], "variants": [('"', '"'), ("", "")]}]
class English(Language): class English(Language):

View File

@ -109,6 +109,8 @@ class BaseDefaults(object):
syntax_iterators = {} syntax_iterators = {}
resources = {} resources = {}
writing_system = {"direction": "ltr", "has_case": True, "has_letters": True} writing_system = {"direction": "ltr", "has_case": True, "has_letters": True}
single_orth_variants = []
paired_orth_variants = []
class Language(object): class Language(object):

View File

@ -192,7 +192,7 @@ browser. Will run a simple web server.
| `style` | unicode | Visualization style, `'dep'` or `'ent'`. | `'dep'` | | `style` | unicode | Visualization style, `'dep'` or `'ent'`. | `'dep'` |
| `page` | bool | Render markup as full HTML page. | `True` | | `page` | bool | Render markup as full HTML page. | `True` |
| `minify` | bool | Minify HTML markup. | `False` | | `minify` | bool | Minify HTML markup. | `False` |
| `options` | dict | [Visualizer-specific options](#options), e.g. colors. | `{}` | | `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` |
| `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` | | `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` |
| `port` | int | Port to serve visualization. | `5000` | | `port` | int | Port to serve visualization. | `5000` |
| `host` | unicode | Host to serve visualization. | `'0.0.0.0'` | | `host` | unicode | Host to serve visualization. | `'0.0.0.0'` |
@ -218,7 +218,7 @@ Render a dependency parse tree or named entity visualization.
| `page` | bool | Render markup as full HTML page. | `False` | | `page` | bool | Render markup as full HTML page. | `False` |
| `minify` | bool | Minify HTML markup. | `False` | | `minify` | bool | Minify HTML markup. | `False` |
| `jupyter` | bool | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None`. | `None` | | `jupyter` | bool | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None`. | `None` |
| `options` | dict | [Visualizer-specific options](#options), e.g. colors. | `{}` | | `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` |
| `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` | | `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` |
| **RETURNS** | unicode | Rendered HTML markup. | | **RETURNS** | unicode | Rendered HTML markup. |

View File

@ -9,6 +9,7 @@
"en_core_web_lg", "en_core_web_lg",
"en_vectors_web_lg", "en_vectors_web_lg",
"en_pytt_bertbaseuncased_lg", "en_pytt_bertbaseuncased_lg",
"en_pytt_robertabase_lg",
"en_pytt_xlnetbasecased_lg" "en_pytt_xlnetbasecased_lg"
], ],
"example": "This is a sentence.", "example": "This is a sentence.",

View File

@ -1715,6 +1715,35 @@
"\tprint(ent.text, ent.start_char, ent.end_char, ent.label_)" "\tprint(ent.text, ent.start_char, ent.end_char, ent.label_)"
], ],
"author": "Stefan Daniel Dumitrescu, Andrei-Marius Avram" "author": "Stefan Daniel Dumitrescu, Andrei-Marius Avram"
},
{
"id": "num_fh",
"title": "Numeric Fused-Head",
"slogan": "Numeric Fused-Head Identification and Resolution in English",
"description": "This package provides a wrapper for the Numeric Fused-Head in English. It provides another information layer on numbers that refer to another entity which is not obvious from the syntactic tree.",
"github": "yanaiela/num_fh",
"pip": "num_fh",
"category": ["pipeline", "research"],
"code_example": [
"import spacy",
"from num_fh import NFH",
"nlp = spacy.load('en_core_web_sm')",
"nfh = NFH(nlp)",
"nlp.add_pipe(nfh, first=False)",
"doc = nlp(\"I told you two, that only one of them is the one who will get 2 or 3 icecreams\")",
"",
"assert doc[16]._.is_nfh == True",
"assert doc[18]._.is_nfh == False",
"assert doc[3]._.is_deter_nfh == True",
"assert doc[16]._.is_deter_nfh == False",
"assert len(doc._.nfh) == 4"
],
"author": "Yanai Elazar",
"author_links": {
"github": "yanaiela",
"twitter": "yanaiela",
"website": "https://yanaiela.github.io"
}
} }
], ],