mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-10 19:57:17 +03:00
Merge pull request #4205 from adrianeboyd/feature/gold-train-orth-variants
Add train_docs() option to add orth variants
This commit is contained in:
commit
6b2ea883ed
|
@ -19,6 +19,8 @@ It's commercial open-source software, released under the MIT license.
|
|||
[![pypi Version](https://img.shields.io/pypi/v/spacy.svg?style=flat-square)](https://pypi.org/project/spacy/)
|
||||
[![conda Version](https://img.shields.io/conda/vn/conda-forge/spacy.svg?style=flat-square)](https://anaconda.org/conda-forge/spacy)
|
||||
[![Python wheels](https://img.shields.io/badge/wheels-%E2%9C%93-4c1.svg?longCache=true&style=flat-square&logo=python&logoColor=white)](https://github.com/explosion/wheelwright/releases)
|
||||
[![PyPi downloads](https://img.shields.io/pypi/dm/spacy?style=flat-square)](https://pypi.org/project/spacy/)
|
||||
[![Conda downloads](https://img.shields.io/conda/dn/conda-forge/spacy?style=flat-square)](https://anaconda.org/conda-forge/spacy)
|
||||
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg?style=flat-square)](https://github.com/ambv/black)
|
||||
[![spaCy on Twitter](https://img.shields.io/twitter/follow/spacy_io.svg?style=social&label=Follow)](https://twitter.com/spacy_io)
|
||||
|
||||
|
|
103
spacy/gold.pyx
103
spacy/gold.pyx
|
@ -7,6 +7,7 @@ import random
|
|||
import numpy
|
||||
import tempfile
|
||||
import shutil
|
||||
import itertools
|
||||
from pathlib import Path
|
||||
import srsly
|
||||
|
||||
|
@ -206,13 +207,14 @@ class GoldCorpus(object):
|
|||
return n
|
||||
|
||||
def train_docs(self, nlp, gold_preproc=False, max_length=None,
|
||||
noise_level=0.0):
|
||||
noise_level=0.0, orth_variant_level=0.0):
|
||||
locs = list((self.tmp_dir / 'train').iterdir())
|
||||
random.shuffle(locs)
|
||||
train_tuples = self.read_tuples(locs, limit=self.limit)
|
||||
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
|
||||
max_length=max_length,
|
||||
noise_level=noise_level,
|
||||
orth_variant_level=orth_variant_level,
|
||||
make_projective=True)
|
||||
yield from gold_docs
|
||||
|
||||
|
@ -226,27 +228,32 @@ class GoldCorpus(object):
|
|||
|
||||
@classmethod
|
||||
def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
|
||||
noise_level=0.0, make_projective=False):
|
||||
noise_level=0.0, orth_variant_level=0.0, make_projective=False):
|
||||
for raw_text, paragraph_tuples in tuples:
|
||||
if gold_preproc:
|
||||
raw_text = None
|
||||
else:
|
||||
paragraph_tuples = merge_sents(paragraph_tuples)
|
||||
docs = cls._make_docs(nlp, raw_text, paragraph_tuples, gold_preproc,
|
||||
noise_level=noise_level)
|
||||
docs, paragraph_tuples = cls._make_docs(nlp, raw_text,
|
||||
paragraph_tuples, gold_preproc, noise_level=noise_level,
|
||||
orth_variant_level=orth_variant_level)
|
||||
golds = cls._make_golds(docs, paragraph_tuples, make_projective)
|
||||
for doc, gold in zip(docs, golds):
|
||||
if (not max_length) or len(doc) < max_length:
|
||||
yield doc, gold
|
||||
|
||||
@classmethod
|
||||
def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0):
|
||||
def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0, orth_variant_level=0.0):
|
||||
if raw_text is not None:
|
||||
raw_text, paragraph_tuples = make_orth_variants(nlp, raw_text, paragraph_tuples, orth_variant_level=orth_variant_level)
|
||||
raw_text = add_noise(raw_text, noise_level)
|
||||
return [nlp.make_doc(raw_text)]
|
||||
return [nlp.make_doc(raw_text)], paragraph_tuples
|
||||
else:
|
||||
docs = []
|
||||
raw_text, paragraph_tuples = make_orth_variants(nlp, None, paragraph_tuples, orth_variant_level=orth_variant_level)
|
||||
return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
|
||||
for (sent_tuples, brackets) in paragraph_tuples]
|
||||
for (sent_tuples, brackets) in paragraph_tuples], paragraph_tuples
|
||||
|
||||
|
||||
@classmethod
|
||||
def _make_golds(cls, docs, paragraph_tuples, make_projective):
|
||||
|
@ -263,6 +270,88 @@ class GoldCorpus(object):
|
|||
in zip(docs, paragraph_tuples)]
|
||||
|
||||
|
||||
def _skip_whitespace(text, start):
    """Return the index of the first non-whitespace character of `text` at or
    after `start` (or `len(text)` if only whitespace remains)."""
    end = start
    while end < len(text) and re.match(r"\s", text[end]):
        end += 1
    return end


def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
    """Randomly replace punctuation tokens with orthographic variants.

    With probability `orth_variant_level`, tokens whose tag and text match an
    entry of `nlp.Defaults.single_orth_variants` or
    `nlp.Defaults.paired_orth_variants` are swapped for a randomly chosen
    variant from the same entry. One choice per entry is drawn per sentence,
    so all matching tokens in a sentence receive the same variant. If `raw`
    is given, it is rebuilt so that it matches the modified tokens.

    nlp: Object exposing `Defaults.single_orth_variants` and
        `Defaults.paired_orth_variants`.
    raw: The raw text the tokens were taken from, or None.
    paragraph_tuples: Iterable of `((ids, words, tags, heads, labels, ner),
        brackets)` pairs. It is never modified; variants are written to copies
        of the word lists.
    orth_variant_level: Probability (0.0-1.0) of applying variants at all.
    RETURNS: A `(raw, paragraph_tuples)` pair. The inputs are returned
        unchanged if no variants were applied or if `raw` could not be
        aligned with the tokens.
    """
    if random.random() >= orth_variant_level:
        return raw, paragraph_tuples
    ndsv = nlp.Defaults.single_orth_variants
    ndpv = nlp.Defaults.paired_orth_variants
    # modify words in copies of the paragraph_tuples
    variant_paragraph_tuples = []
    for sent_tuples, brackets in paragraph_tuples:
        ids, words, tags, heads, labels, ner = sent_tuples
        # Copy before editing: the abort path below hands the caller's
        # paragraph_tuples back together with the original raw text, so they
        # must still agree with each other.
        words = list(words)
        # single variants
        punct_choices = [random.choice(x["variants"]) for x in ndsv]
        for word_idx in range(len(words)):
            for punct_idx, group in enumerate(ndsv):
                if tags[word_idx] in group["tags"] \
                        and words[word_idx] in group["variants"]:
                    words[word_idx] = punct_choices[punct_idx]
        # paired variants
        punct_choices = [random.choice(x["variants"]) for x in ndpv]
        for word_idx in range(len(words)):
            for punct_idx, group in enumerate(ndpv):
                if tags[word_idx] in group["tags"] \
                        and words[word_idx] in itertools.chain.from_iterable(group["variants"]):
                    # backup option: random left vs. right from pair
                    pair_idx = random.choice([0, 1])
                    # best option: rely on paired POS tags like `` / ''
                    if len(group["tags"]) == 2:
                        pair_idx = group["tags"].index(tags[word_idx])
                    # next best option: rely on position in variants
                    # (may not be unambiguous, so order of variants matters)
                    else:
                        for pair in group["variants"]:
                            if words[word_idx] in pair:
                                pair_idx = pair.index(words[word_idx])
                    words[word_idx] = punct_choices[punct_idx][pair_idx]
        variant_paragraph_tuples.append(((ids, words, tags, heads, labels, ner), brackets))
    if raw is None:
        return raw, variant_paragraph_tuples
    # modify raw to match variant_paragraph_tuples
    variants = []
    for single_variants in ndsv:
        variants.extend(single_variants["variants"])
    for paired_variants in ndpv:
        variants.extend(itertools.chain.from_iterable(paired_variants["variants"]))
    # store variants in reverse length order to be able to prioritize
    # longer matches (e.g., "---" before "--")
    variants = sorted(variants, key=len, reverse=True)
    variant_set = set(variants)
    pieces = []
    # add initial whitespace
    raw_idx = _skip_whitespace(raw, 0)
    pieces.append(raw[:raw_idx])
    for sent_tuples, brackets in variant_paragraph_tuples:
        for word in sent_tuples[1]:
            # add identical word
            if word not in variant_set and raw.startswith(word, raw_idx):
                raw_idx += len(word)
            # add variant word in place of whichever variant raw contains
            else:
                for variant in variants:
                    if raw.startswith(variant, raw_idx):
                        raw_idx += len(variant)
                        break
                else:
                    # something went wrong, abort
                    # (add a warning message?)
                    return raw, paragraph_tuples
            pieces.append(word)
            # add following whitespace
            whitespace_end = _skip_whitespace(raw, raw_idx)
            pieces.append(raw[raw_idx:whitespace_end])
            raw_idx = whitespace_end
    # Text left over after the last token means raw and the tokens do not
    # line up; abort rather than silently truncating the raw text.
    if raw_idx != len(raw):
        return raw, paragraph_tuples
    return "".join(pieces), variant_paragraph_tuples
|
||||
|
||||
|
||||
def add_noise(orig, noise_level):
|
||||
if random.random() >= noise_level:
|
||||
return orig
|
||||
|
|
|
@ -27,6 +27,10 @@ class GermanDefaults(Language.Defaults):
|
|||
stop_words = STOP_WORDS
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
resources = {"lemma_lookup": "lemma_lookup.json"}
|
||||
single_orth_variants = [{"tags": ["$("], "variants": ["…", "..."]},
|
||||
{"tags": ["$("], "variants": ["-", "—", "–", "--", "---", "——"]}]
|
||||
paired_orth_variants = [{"tags": ["$("], "variants": [("'", "'"), (",", "'"), ("‚", "‘")]},
|
||||
{"tags": ["$("], "variants": [("``", "''"), ('"', '"'), ("„", "“")]}]
|
||||
|
||||
|
||||
class German(Language):
|
||||
|
|
|
@ -38,6 +38,10 @@ class EnglishDefaults(Language.Defaults):
|
|||
"lemma_index": "lemmatizer/lemma_index.json",
|
||||
"lemma_exc": "lemmatizer/lemma_exc.json",
|
||||
}
|
||||
single_orth_variants = [{"tags": ["NFP"], "variants": ["…", "..."]},
|
||||
{"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]}]
|
||||
paired_orth_variants = [{"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]},
|
||||
{"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]}]
|
||||
|
||||
|
||||
class English(Language):
|
||||
|
|
|
@ -109,6 +109,8 @@ class BaseDefaults(object):
|
|||
syntax_iterators = {}
|
||||
resources = {}
|
||||
writing_system = {"direction": "ltr", "has_case": True, "has_letters": True}
|
||||
single_orth_variants = []
|
||||
paired_orth_variants = []
|
||||
|
||||
|
||||
class Language(object):
|
||||
|
|
|
@ -192,7 +192,7 @@ browser. Will run a simple web server.
|
|||
| `style` | unicode | Visualization style, `'dep'` or `'ent'`. | `'dep'` |
|
||||
| `page` | bool | Render markup as full HTML page. | `True` |
|
||||
| `minify` | bool | Minify HTML markup. | `False` |
|
||||
| `options` | dict | [Visualizer-specific options](#options), e.g. colors. | `{}` |
|
||||
| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` |
|
||||
| `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` |
|
||||
| `port` | int | Port to serve visualization. | `5000` |
|
||||
| `host` | unicode | Host to serve visualization. | `'0.0.0.0'` |
|
||||
|
@ -218,7 +218,7 @@ Render a dependency parse tree or named entity visualization.
|
|||
| `page` | bool | Render markup as full HTML page. | `False` |
|
||||
| `minify` | bool | Minify HTML markup. | `False` |
|
||||
| `jupyter` | bool | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None`. | `None` |
|
||||
| `options` | dict | [Visualizer-specific options](#options), e.g. colors. | `{}` |
|
||||
| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` |
|
||||
| `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` |
|
||||
| **RETURNS** | unicode | Rendered HTML markup. |
|
||||
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
"en_core_web_lg",
|
||||
"en_vectors_web_lg",
|
||||
"en_pytt_bertbaseuncased_lg",
|
||||
"en_pytt_robertabase_lg",
|
||||
"en_pytt_xlnetbasecased_lg"
|
||||
],
|
||||
"example": "This is a sentence.",
|
||||
|
|
|
@ -1715,6 +1715,35 @@
|
|||
"\tprint(ent.text, ent.start_char, ent.end_char, ent.label_)"
|
||||
],
|
||||
"author": "Stefan Daniel Dumitrescu, Andrei-Marius Avram"
|
||||
},
|
||||
{
|
||||
"id": "num_fh",
|
||||
"title": "Numeric Fused-Head",
|
||||
"slogan": "Numeric Fused-Head Identification and Resolution in English",
|
||||
"description": "This package provides a wrapper for the Numeric Fused-Head in English. It provides another information layer on numbers that refer to another entity which is not obvious from the syntactic tree.",
|
||||
"github": "yanaiela/num_fh",
|
||||
"pip": "num_fh",
|
||||
"category": ["pipeline", "research"],
|
||||
"code_example": [
|
||||
"import spacy",
|
||||
"from num_fh import NFH",
|
||||
"nlp = spacy.load('en_core_web_sm')",
|
||||
"nfh = NFH(nlp)",
|
||||
"nlp.add_pipe(nfh, first=False)",
|
||||
"doc = nlp(\"I told you two, that only one of them is the one who will get 2 or 3 icecreams\")",
|
||||
"",
|
||||
"assert doc[16]._.is_nfh == True",
|
||||
"assert doc[18]._.is_nfh == False",
|
||||
"assert doc[3]._.is_deter_nfh == True",
|
||||
"assert doc[16]._.is_deter_nfh == False",
|
||||
"assert len(doc._.nfh) == 4"
|
||||
],
|
||||
"author": "Yanai Elazar",
|
||||
"author_links": {
|
||||
"github": "yanaiela",
|
||||
"twitter": "yanaiela",
|
||||
"website": "https://yanaiela.github.io"
|
||||
}
|
||||
}
|
||||
],
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user