Mirror of https://github.com/explosion/spaCy.git

Merge pull request #4205 from adrianeboyd/feature/gold-train-orth-variants

Add train_docs() option to add orth variants

Commit 6b2ea883ed
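For context, a minimal sketch of how the new option could be invoked during training, assuming spaCy v2.x and hypothetical corpus paths (train.json / dev.json):

import spacy
from spacy.gold import GoldCorpus

# hypothetical paths to JSON-formatted training and development corpora
corpus = GoldCorpus("train.json", "dev.json")
nlp = spacy.blank("en")
# with orth_variant_level=0.2, roughly 20% of training paragraphs are
# rewritten with alternative quote/dash/ellipsis forms before tokenization
for doc, gold in corpus.train_docs(nlp, noise_level=0.0, orth_variant_level=0.2):
    pass  # feed each (doc, gold) pair into nlp.update(...) during training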
README.md

@@ -19,6 +19,8 @@ It's commercial open-source software, released under the MIT license.
 [![pypi Version](https://img.shields.io/pypi/v/spacy.svg?style=flat-square)](https://pypi.org/project/spacy/)
 [![conda Version](https://img.shields.io/conda/vn/conda-forge/spacy.svg?style=flat-square)](https://anaconda.org/conda-forge/spacy)
 [![Python wheels](https://img.shields.io/badge/wheels-%E2%9C%93-4c1.svg?longCache=true&style=flat-square&logo=python&logoColor=white)](https://github.com/explosion/wheelwright/releases)
+[![PyPi downloads](https://img.shields.io/pypi/dm/spacy?style=flat-square)](https://pypi.org/project/spacy/)
+[![Conda downloads](https://img.shields.io/conda/dn/conda-forge/spacy?style=flat-square)](https://anaconda.org/conda-forge/spacy)
 [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg?style=flat-square)](https://github.com/ambv/black)
 [![spaCy on Twitter](https://img.shields.io/twitter/follow/spacy_io.svg?style=social&label=Follow)](https://twitter.com/spacy_io)
spacy/gold.pyx (103 lines changed)
@@ -7,6 +7,7 @@ import random
 import numpy
 import tempfile
 import shutil
+import itertools
 from pathlib import Path
 import srsly
@@ -206,13 +207,14 @@ class GoldCorpus(object):
         return n

     def train_docs(self, nlp, gold_preproc=False, max_length=None,
-                   noise_level=0.0):
+                   noise_level=0.0, orth_variant_level=0.0):
         locs = list((self.tmp_dir / 'train').iterdir())
         random.shuffle(locs)
         train_tuples = self.read_tuples(locs, limit=self.limit)
         gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
                                         max_length=max_length,
                                         noise_level=noise_level,
+                                        orth_variant_level=orth_variant_level,
                                         make_projective=True)
         yield from gold_docs
@@ -226,27 +228,32 @@ class GoldCorpus(object):

     @classmethod
     def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
-                       noise_level=0.0, make_projective=False):
+                       noise_level=0.0, orth_variant_level=0.0, make_projective=False):
         for raw_text, paragraph_tuples in tuples:
             if gold_preproc:
                 raw_text = None
             else:
                 paragraph_tuples = merge_sents(paragraph_tuples)
-            docs = cls._make_docs(nlp, raw_text, paragraph_tuples, gold_preproc,
-                                  noise_level=noise_level)
+            docs, paragraph_tuples = cls._make_docs(nlp, raw_text,
+                paragraph_tuples, gold_preproc, noise_level=noise_level,
+                orth_variant_level=orth_variant_level)
             golds = cls._make_golds(docs, paragraph_tuples, make_projective)
             for doc, gold in zip(docs, golds):
                 if (not max_length) or len(doc) < max_length:
                     yield doc, gold

     @classmethod
-    def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0):
+    def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0, orth_variant_level=0.0):
         if raw_text is not None:
+            raw_text, paragraph_tuples = make_orth_variants(nlp, raw_text, paragraph_tuples, orth_variant_level=orth_variant_level)
             raw_text = add_noise(raw_text, noise_level)
-            return [nlp.make_doc(raw_text)]
+            return [nlp.make_doc(raw_text)], paragraph_tuples
         else:
+            docs = []
+            raw_text, paragraph_tuples = make_orth_variants(nlp, None, paragraph_tuples, orth_variant_level=orth_variant_level)
             return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
-                    for (sent_tuples, brackets) in paragraph_tuples]
+                    for (sent_tuples, brackets) in paragraph_tuples], paragraph_tuples

     @classmethod
     def _make_golds(cls, docs, paragraph_tuples, make_projective):
@@ -263,6 +270,88 @@ class GoldCorpus(object):
                 in zip(docs, paragraph_tuples)]


+def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
+    if random.random() >= orth_variant_level:
+        return raw, paragraph_tuples
+    ndsv = nlp.Defaults.single_orth_variants
+    ndpv = nlp.Defaults.paired_orth_variants
+    # modify words in paragraph_tuples
+    variant_paragraph_tuples = []
+    for sent_tuples, brackets in paragraph_tuples:
+        ids, words, tags, heads, labels, ner = sent_tuples
+        # single variants
+        punct_choices = [random.choice(x["variants"]) for x in ndsv]
+        for word_idx in range(len(words)):
+            for punct_idx in range(len(ndsv)):
+                if tags[word_idx] in ndsv[punct_idx]["tags"] \
+                        and words[word_idx] in ndsv[punct_idx]["variants"]:
+                    words[word_idx] = punct_choices[punct_idx]
+        # paired variants
+        punct_choices = [random.choice(x["variants"]) for x in ndpv]
+        for word_idx in range(len(words)):
+            for punct_idx in range(len(ndpv)):
+                if tags[word_idx] in ndpv[punct_idx]["tags"] \
+                        and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]):
+                    # backup option: random left vs. right from pair
+                    pair_idx = random.choice([0, 1])
+                    # best option: rely on paired POS tags like `` / ''
+                    if len(ndpv[punct_idx]["tags"]) == 2:
+                        pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx])
+                    # next best option: rely on position in variants
+                    # (may not be unambiguous, so order of variants matters)
+                    else:
+                        for pair in ndpv[punct_idx]["variants"]:
+                            if words[word_idx] in pair:
+                                pair_idx = pair.index(words[word_idx])
+                    words[word_idx] = punct_choices[punct_idx][pair_idx]
+
+        variant_paragraph_tuples.append(((ids, words, tags, heads, labels, ner), brackets))
+    # modify raw to match variant_paragraph_tuples
+    if raw is not None:
+        variants = []
+        for single_variants in ndsv:
+            variants.extend(single_variants["variants"])
+        for paired_variants in ndpv:
+            variants.extend(list(itertools.chain.from_iterable(paired_variants["variants"])))
+        # store variants in reverse length order to be able to prioritize
+        # longer matches (e.g., "---" before "--")
+        variants = sorted(variants, key=lambda x: len(x))
+        variants.reverse()
+        variant_raw = ""
+        raw_idx = 0
+        # add initial whitespace
+        while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
+            variant_raw += raw[raw_idx]
+            raw_idx += 1
+        for sent_tuples, brackets in variant_paragraph_tuples:
+            ids, words, tags, heads, labels, ner = sent_tuples
+            for word in words:
+                match_found = False
+                # add identical word
+                if word not in variants and raw[raw_idx:].startswith(word):
+                    variant_raw += word
+                    raw_idx += len(word)
+                    match_found = True
+                # add variant word
+                else:
+                    for variant in variants:
+                        if not match_found and \
+                                raw[raw_idx:].startswith(variant):
+                            raw_idx += len(variant)
+                            variant_raw += word
+                            match_found = True
+                # something went wrong, abort
+                # (add a warning message?)
+                if not match_found:
+                    return raw, paragraph_tuples
+                # add following whitespace
+                while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
+                    variant_raw += raw[raw_idx]
+                    raw_idx += 1
+        return variant_raw, variant_paragraph_tuples
+    return raw, variant_paragraph_tuples


 def add_noise(orig, noise_level):
     if random.random() >= noise_level:
         return orig
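To make the paired-variant branch above concrete, here is a small self-contained sketch (toy data and names, not spaCy internals) showing how a tagged quote pair is rewritten consistently:

import itertools
import random

# toy data shaped like the paired_orth_variants entries in this PR
ndpv = [{"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]}]
words = ["“", "Hello", "”"]
tags = ["``", "NNP", "''"]

# one random pair is drawn per variant group, then applied consistently
punct_choices = [random.choice(x["variants"]) for x in ndpv]
for word_idx in range(len(words)):
    for punct_idx in range(len(ndpv)):
        entry = ndpv[punct_idx]
        if tags[word_idx] in entry["tags"] \
                and words[word_idx] in itertools.chain.from_iterable(entry["variants"]):
            # the paired PTB tags `` / '' identify the opening vs. closing slot
            pair_idx = entry["tags"].index(tags[word_idx])
            words[word_idx] = punct_choices[punct_idx][pair_idx]

print(words)  # e.g. ['"', 'Hello', '"'] if the straight-quote pair was drawn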
spacy/lang/de/__init__.py

@@ -27,6 +27,10 @@ class GermanDefaults(Language.Defaults):
     stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
     resources = {"lemma_lookup": "lemma_lookup.json"}
+    single_orth_variants = [{"tags": ["$("], "variants": ["…", "..."]},
+                            {"tags": ["$("], "variants": ["-", "—", "–", "--", "---", "——"]}]
+    paired_orth_variants = [{"tags": ["$("], "variants": [("'", "'"), (",", "'"), ("‚", "‘")]},
+                            {"tags": ["$("], "variants": [("``", "''"), ('"', '"'), ("„", "“")]}]


 class German(Language):
spacy/lang/en/__init__.py

@@ -38,6 +38,10 @@ class EnglishDefaults(Language.Defaults):
         "lemma_index": "lemmatizer/lemma_index.json",
         "lemma_exc": "lemmatizer/lemma_exc.json",
     }
+    single_orth_variants = [{"tags": ["NFP"], "variants": ["…", "..."]},
+                            {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]}]
+    paired_orth_variants = [{"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]},
+                            {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]}]


 class English(Language):
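As an illustration of how the single-variant rule uses these tables, a standalone sketch with toy tokens, mirroring the loop in make_orth_variants above:

import random

# toy data mirroring EnglishDefaults.single_orth_variants above
ndsv = [{"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]}]
words = ["Wait", "--", "no"]
tags = ["VB", ":", "UH"]

# every matching token in the sentence receives the same drawn replacement
punct_choices = [random.choice(x["variants"]) for x in ndsv]
for word_idx in range(len(words)):
    for punct_idx in range(len(ndsv)):
        if tags[word_idx] in ndsv[punct_idx]["tags"] \
                and words[word_idx] in ndsv[punct_idx]["variants"]:
            words[word_idx] = punct_choices[punct_idx]

print(words)  # e.g. ['Wait', '—', 'no']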
spacy/language.py

@@ -109,6 +109,8 @@ class BaseDefaults(object):
     syntax_iterators = {}
     resources = {}
     writing_system = {"direction": "ltr", "has_case": True, "has_letters": True}
+    single_orth_variants = []
+    paired_orth_variants = []


 class Language(object):
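Because the base defaults are empty lists, languages without variant tables are unaffected. A sketch of how a custom subclass might opt in, assuming the v2-style Defaults mechanism (CustomEnglishDefaults and CustomEnglish are hypothetical names):

from spacy.lang.en import English, EnglishDefaults

class CustomEnglishDefaults(EnglishDefaults):
    # only vary ellipses; disable paired quote substitutions
    single_orth_variants = [{"tags": ["NFP"], "variants": ["…", "..."]}]
    paired_orth_variants = []

class CustomEnglish(English):
    Defaults = CustomEnglishDefaults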
website/docs/api/top-level.md

@@ -192,7 +192,7 @@ browser. Will run a simple web server.
 | `style` | unicode | Visualization style, `'dep'` or `'ent'`. | `'dep'` |
 | `page` | bool | Render markup as full HTML page. | `True` |
 | `minify` | bool | Minify HTML markup. | `False` |
-| `options` | dict | [Visualizer-specific options](#options), e.g. colors. | `{}` |
+| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` |
 | `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` |
 | `port` | int | Port to serve visualization. | `5000` |
 | `host` | unicode | Host to serve visualization. | `'0.0.0.0'` |

@@ -218,7 +218,7 @@ Render a dependency parse tree or named entity visualization.
 | `page` | bool | Render markup as full HTML page. | `False` |
 | `minify` | bool | Minify HTML markup. | `False` |
 | `jupyter` | bool | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None`. | `None` |
-| `options` | dict | [Visualizer-specific options](#options), e.g. colors. | `{}` |
+| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` |
 | `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` |
 | **RETURNS** | unicode | Rendered HTML markup. |
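For reference, a short usage sketch of the two calls documented above (assumes en_core_web_sm is installed):

import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying a U.K. startup.")

# render entity markup as a full HTML page; `options` holds the
# visualizer-specific settings linked in the tables above
html = displacy.render(doc, style="ent", options={"ents": ["ORG", "GPE"]}, page=True)

# or start a simple web server with the dependency visualizer
# displacy.serve(doc, style="dep", port=5000)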
@@ -9,6 +9,7 @@
     "en_core_web_lg",
     "en_vectors_web_lg",
     "en_pytt_bertbaseuncased_lg",
+    "en_pytt_robertabase_lg",
     "en_pytt_xlnetbasecased_lg"
 ],
 "example": "This is a sentence.",
website/meta/universe.json

@@ -1715,6 +1715,35 @@
             "\tprint(ent.text, ent.start_char, ent.end_char, ent.label_)"
         ],
         "author": "Stefan Daniel Dumitrescu, Andrei-Marius Avram"
+    },
+    {
+        "id": "num_fh",
+        "title": "Numeric Fused-Head",
+        "slogan": "Numeric Fused-Head Identification and Resolution in English",
+        "description": "This package provides a wrapper for the Numeric Fused-Head in English. It adds another information layer on numbers that refer to another entity which is not obvious from the syntactic tree.",
+        "github": "yanaiela/num_fh",
+        "pip": "num_fh",
+        "category": ["pipeline", "research"],
+        "code_example": [
+            "import spacy",
+            "from num_fh import NFH",
+            "nlp = spacy.load('en_core_web_sm')",
+            "nfh = NFH(nlp)",
+            "nlp.add_pipe(nfh, first=False)",
+            "doc = nlp(\"I told you two, that only one of them is the one who will get 2 or 3 icecreams\")",
+            "",
+            "assert doc[16]._.is_nfh == True",
+            "assert doc[18]._.is_nfh == False",
+            "assert doc[3]._.is_deter_nfh == True",
+            "assert doc[16]._.is_deter_nfh == False",
+            "assert len(doc._.nfh) == 4"
+        ],
+        "author": "Yanai Elazar",
+        "author_links": {
+            "github": "yanaiela",
+            "twitter": "yanaiela",
+            "website": "https://yanaiela.github.io"
+        }
+    }
 ],