From b37fdbb61353a86698f91a728cbf6b23dc22c86b Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 25 Aug 2020 13:28:42 +0200
Subject: [PATCH] Clean out /examples and /bin

---
 bin/__init__.py | 0
 bin/load_reddit.py | 97 --
 bin/train_word_vectors.py | 81 --
 bin/ud/__init__.py | 2 -
 bin/ud/conll17_ud_eval.py | 614 -----
 bin/ud/run_eval.py | 297 ------
 bin/ud/ud_run_test.py | 324 ------
 bin/ud/ud_train.py | 559 ----------
 examples/README.md | 19 -
 examples/deep_learning_keras.py | 266 -----
 .../entity_relations.py | 82 --
 .../information_extraction/parse_subtrees.py | 67 --
 .../information_extraction/phrase_matcher.py | 112 --
 examples/keras_parikh_entailment/README.md | 114 ---
 examples/keras_parikh_entailment/__main__.py | 207 ----
 .../keras_decomposable_attention.py | 152 ---
 .../keras_parikh_entailment/spacy_hook.py | 77 --
 examples/load_from_docbin.py | 45 -
 .../notebooks/Decompositional Attention.ipynb | 955 ------
 examples/pipeline/custom_attr_methods.py | 78 --
 .../custom_component_countries_api.py | 130 ---
 .../pipeline/custom_component_entities.py | 115 ---
 .../pipeline/custom_sentence_segmentation.py | 61 --
 examples/pipeline/fix_space_entities.py | 37 -
 examples/pipeline/multi_processing.py | 85 --
 examples/streamlit_spacy.py | 165 ---
 examples/training/conllu-config.json | 1 -
 examples/training/conllu.py | 404 --------
 examples/training/create_kb.py | 114 ---
 examples/training/ner_multitask_objective.py | 88 --
 examples/training/rehearsal.py | 96 --
 examples/training/train_entity_linker.py | 177 ----
 examples/training/train_intent_parser.py | 195 ----
 examples/training/train_morphologizer.py | 136 ---
 examples/training/train_ner.py | 118 ---
 examples/training/train_new_entity_type.py | 143 ---
 examples/training/train_parser.py | 110 --
 examples/training/train_tagger.py | 105 --
 examples/training/train_textcat.py | 192 ----
 examples/training/train_textcat_config.cfg | 14 -
 examples/vectors_fast_text.py | 49 -
 examples/vectors_tensorboard.py | 105 --
 .../example_data}/ner_example_data/README.md | 0
 .../ner_example_data/ner-sent-per-line.iob | 0
 .../ner_example_data/ner-sent-per-line.json | 0
 .../ner-token-per-line-conll2003.iob | 0
 .../ner-token-per-line-conll2003.json | 0
 .../ner-token-per-line-with-pos.iob | 0
 .../ner-token-per-line-with-pos.json | 0
 .../ner_example_data/ner-token-per-line.iob | 0
 .../ner_example_data/ner-token-per-line.json | 0
 .../textcat_example_data/CC0.txt | 0
 .../textcat_example_data/CC_BY-SA-3.0.txt | 0
 .../textcat_example_data/CC_BY-SA-4.0.txt | 0
 .../textcat_example_data/README.md | 0
 .../textcat_example_data/cooking.json | 0
 .../textcat_example_data/cooking.jsonl | 0
 .../jigsaw-toxic-comment.json | 0
 .../jigsaw-toxic-comment.jsonl | 0
 .../textcatjsonl_to_trainjson.py | 0
 .../example_data}/training-data.json | 0
 .../example_data}/vocab-data.jsonl | 0
 .../experiments/onto-joint/defaults.cfg | 0
 .../experiments/onto-joint/pretrain.cfg | 0
 {examples => extra}/experiments/onto-ner.cfg | 0
 .../ptb-joint-pos-dep/bilstm_tok2vec.cfg | 0
 .../ptb-joint-pos-dep/defaults.cfg | 0
 .../tok2vec-ner/charembed_tok2vec.cfg | 0
 .../tok2vec-ner/multihashembed_tok2vec.cfg | 0
 69 files changed, 6788 deletions(-)
 delete mode 100644 bin/__init__.py
 delete mode 100644 bin/load_reddit.py
 delete mode 100644 bin/train_word_vectors.py
 delete mode 100644 bin/ud/__init__.py
 delete mode 100644 bin/ud/conll17_ud_eval.py
 delete mode 100644 bin/ud/run_eval.py
 delete mode 100644 bin/ud/ud_run_test.py
 delete mode 100644 bin/ud/ud_train.py
 delete mode 100644 examples/README.md
 delete mode 100644 examples/deep_learning_keras.py
 delete mode 100644 examples/information_extraction/entity_relations.py
 delete mode 100644 examples/information_extraction/parse_subtrees.py
 delete mode 100644 examples/information_extraction/phrase_matcher.py
 delete mode 100644 examples/keras_parikh_entailment/README.md
 delete mode 100644 examples/keras_parikh_entailment/__main__.py
 delete mode 100644 examples/keras_parikh_entailment/keras_decomposable_attention.py
 delete mode 100644 examples/keras_parikh_entailment/spacy_hook.py
 delete mode 100644 examples/load_from_docbin.py
 delete mode 100644 examples/notebooks/Decompositional Attention.ipynb
 delete mode 100644 examples/pipeline/custom_attr_methods.py
 delete mode 100644 examples/pipeline/custom_component_countries_api.py
 delete mode 100644 examples/pipeline/custom_component_entities.py
 delete mode 100644 examples/pipeline/custom_sentence_segmentation.py
 delete mode 100644 examples/pipeline/fix_space_entities.py
 delete mode 100644 examples/pipeline/multi_processing.py
 delete mode 100644 examples/streamlit_spacy.py
 delete mode 100644 examples/training/conllu-config.json
 delete mode 100644 examples/training/conllu.py
 delete mode 100644 examples/training/create_kb.py
 delete mode 100644 examples/training/ner_multitask_objective.py
 delete mode 100644 examples/training/rehearsal.py
 delete mode 100644 examples/training/train_entity_linker.py
 delete mode 100644 examples/training/train_intent_parser.py
 delete mode 100644 examples/training/train_morphologizer.py
 delete mode 100644 examples/training/train_ner.py
 delete mode 100644 examples/training/train_new_entity_type.py
 delete mode 100644 examples/training/train_parser.py
 delete mode 100644 examples/training/train_tagger.py
 delete mode 100644 examples/training/train_textcat.py
 delete mode 100644 examples/training/train_textcat_config.cfg
 delete mode 100644 examples/vectors_fast_text.py
 delete mode 100644 examples/vectors_tensorboard.py
 rename {examples/training => extra/example_data}/ner_example_data/README.md (100%)
 rename {examples/training => extra/example_data}/ner_example_data/ner-sent-per-line.iob (100%)
 rename {examples/training => extra/example_data}/ner_example_data/ner-sent-per-line.json (100%)
 rename {examples/training => extra/example_data}/ner_example_data/ner-token-per-line-conll2003.iob (100%)
 rename {examples/training => extra/example_data}/ner_example_data/ner-token-per-line-conll2003.json (100%)
 rename {examples/training => extra/example_data}/ner_example_data/ner-token-per-line-with-pos.iob (100%)
 rename {examples/training => extra/example_data}/ner_example_data/ner-token-per-line-with-pos.json (100%)
 rename {examples/training => extra/example_data}/ner_example_data/ner-token-per-line.iob (100%)
 rename {examples/training => extra/example_data}/ner_example_data/ner-token-per-line.json (100%)
 rename {examples/training => extra/example_data}/textcat_example_data/CC0.txt (100%)
 rename {examples/training => extra/example_data}/textcat_example_data/CC_BY-SA-3.0.txt (100%)
 rename {examples/training => extra/example_data}/textcat_example_data/CC_BY-SA-4.0.txt (100%)
 rename {examples/training => extra/example_data}/textcat_example_data/README.md (100%)
 rename {examples/training => extra/example_data}/textcat_example_data/cooking.json (100%)
 rename {examples/training => extra/example_data}/textcat_example_data/cooking.jsonl (100%)
 rename {examples/training => extra/example_data}/textcat_example_data/jigsaw-toxic-comment.json (100%)
 rename {examples/training =>
extra/example_data}/textcat_example_data/jigsaw-toxic-comment.jsonl (100%) rename {examples/training => extra/example_data}/textcat_example_data/textcatjsonl_to_trainjson.py (100%) rename {examples/training => extra/example_data}/training-data.json (100%) rename {examples/training => extra/example_data}/vocab-data.jsonl (100%) rename {examples => extra}/experiments/onto-joint/defaults.cfg (100%) rename {examples => extra}/experiments/onto-joint/pretrain.cfg (100%) rename {examples => extra}/experiments/onto-ner.cfg (100%) rename {examples => extra}/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg (100%) rename {examples => extra}/experiments/ptb-joint-pos-dep/defaults.cfg (100%) rename {examples => extra}/experiments/tok2vec-ner/charembed_tok2vec.cfg (100%) rename {examples => extra}/experiments/tok2vec-ner/multihashembed_tok2vec.cfg (100%) diff --git a/bin/__init__.py b/bin/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/bin/load_reddit.py b/bin/load_reddit.py deleted file mode 100644 index afddd3798..000000000 --- a/bin/load_reddit.py +++ /dev/null @@ -1,97 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -import bz2 -import re -import srsly -import sys -import random -import datetime -import plac -from pathlib import Path - -_unset = object() - - -class Reddit(object): - """Stream cleaned comments from Reddit.""" - - pre_format_re = re.compile(r"^[`*~]") - post_format_re = re.compile(r"[`*~]$") - url_re = re.compile(r"\[([^]]+)\]\(%%URL\)") - link_re = re.compile(r"\[([^]]+)\]\(https?://[^\)]+\)") - - def __init__(self, file_path, meta_keys={"subreddit": "section"}): - """ - file_path (unicode / Path): Path to archive or directory of archives. - meta_keys (dict): Meta data key included in the Reddit corpus, mapped - to display name in Prodigy meta. - RETURNS (Reddit): The Reddit loader. 
- """ - self.meta = meta_keys - file_path = Path(file_path) - if not file_path.exists(): - raise IOError("Can't find file path: {}".format(file_path)) - if not file_path.is_dir(): - self.files = [file_path] - else: - self.files = list(file_path.iterdir()) - - def __iter__(self): - for file_path in self.iter_files(): - with bz2.open(str(file_path)) as f: - for line in f: - line = line.strip() - if not line: - continue - comment = srsly.json_loads(line) - if self.is_valid(comment): - text = self.strip_tags(comment["body"]) - yield {"text": text} - - def get_meta(self, item): - return {name: item.get(key, "n/a") for key, name in self.meta.items()} - - def iter_files(self): - for file_path in self.files: - yield file_path - - def strip_tags(self, text): - text = self.link_re.sub(r"\1", text) - text = text.replace(">", ">").replace("<", "<") - text = self.pre_format_re.sub("", text) - text = self.post_format_re.sub("", text) - text = re.sub(r"\s+", " ", text) - return text.strip() - - def is_valid(self, comment): - return ( - comment["body"] is not None - and comment["body"] != "[deleted]" - and comment["body"] != "[removed]" - ) - - -def main(path): - reddit = Reddit(path) - for comment in reddit: - print(srsly.json_dumps(comment)) - - -if __name__ == "__main__": - import socket - - try: - BrokenPipeError - except NameError: - BrokenPipeError = socket.error - try: - plac.call(main) - except BrokenPipeError: - import os, sys - - # Python flushes standard streams on exit; redirect remaining output - # to devnull to avoid another BrokenPipeError at shutdown - devnull = os.open(os.devnull, os.O_WRONLY) - os.dup2(devnull, sys.stdout.fileno()) - sys.exit(1) # Python exits with error code 1 on EPIPE diff --git a/bin/train_word_vectors.py b/bin/train_word_vectors.py deleted file mode 100644 index 663ce060d..000000000 --- a/bin/train_word_vectors.py +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env python -from __future__ import print_function, unicode_literals, division - -import logging -from pathlib import Path -from collections import defaultdict -from gensim.models import Word2Vec -import plac -import spacy - -logger = logging.getLogger(__name__) - - -class Corpus(object): - def __init__(self, directory, nlp): - self.directory = directory - self.nlp = nlp - - def __iter__(self): - for text_loc in iter_dir(self.directory): - with text_loc.open("r", encoding="utf-8") as file_: - text = file_.read() - - # This is to keep the input to the blank model (which doesn't - # sentencize) from being too long. 
It works particularly well with - # the output of [WikiExtractor](https://github.com/attardi/wikiextractor) - paragraphs = text.split('\n\n') - for par in paragraphs: - yield [word.orth_ for word in self.nlp(par)] - - -def iter_dir(loc): - dir_path = Path(loc) - for fn_path in dir_path.iterdir(): - if fn_path.is_dir(): - for sub_path in fn_path.iterdir(): - yield sub_path - else: - yield fn_path - - -@plac.annotations( - lang=("ISO language code"), - in_dir=("Location of input directory"), - out_loc=("Location of output file"), - n_workers=("Number of workers", "option", "n", int), - size=("Dimension of the word vectors", "option", "d", int), - window=("Context window size", "option", "w", int), - min_count=("Min count", "option", "m", int), - negative=("Number of negative samples", "option", "g", int), - nr_iter=("Number of iterations", "option", "i", int), -) -def main( - lang, - in_dir, - out_loc, - negative=5, - n_workers=4, - window=5, - size=128, - min_count=10, - nr_iter=5, -): - logging.basicConfig( - format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO - ) - nlp = spacy.blank(lang) - corpus = Corpus(in_dir, nlp) - model = Word2Vec( - sentences=corpus, - size=size, - window=window, - min_count=min_count, - workers=n_workers, - sample=1e-5, - negative=negative, - ) - model.save(out_loc) - -if __name__ == "__main__": - plac.call(main) diff --git a/bin/ud/__init__.py b/bin/ud/__init__.py deleted file mode 100644 index 119c46ba4..000000000 --- a/bin/ud/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .conll17_ud_eval import main as ud_evaluate # noqa: F401 -from .ud_train import main as ud_train # noqa: F401 diff --git a/bin/ud/conll17_ud_eval.py b/bin/ud/conll17_ud_eval.py deleted file mode 100644 index 88acfabac..000000000 --- a/bin/ud/conll17_ud_eval.py +++ /dev/null @@ -1,614 +0,0 @@ -#!/usr/bin/env python -# flake8: noqa - -# CoNLL 2017 UD Parsing evaluation script. -# -# Compatible with Python 2.7 and 3.2+, can be used either as a module -# or a standalone executable. -# -# Copyright 2017 Institute of Formal and Applied Linguistics (UFAL), -# Faculty of Mathematics and Physics, Charles University, Czech Republic. 
-# -# Changelog: -# - [02 Jan 2017] Version 0.9: Initial release -# - [25 Jan 2017] Version 0.9.1: Fix bug in LCS alignment computation -# - [10 Mar 2017] Version 1.0: Add documentation and test -# Compare HEADs correctly using aligned words -# Allow evaluation with errorneous spaces in forms -# Compare forms in LCS case insensitively -# Detect cycles and multiple root nodes -# Compute AlignedAccuracy - -# Command line usage -# ------------------ -# conll17_ud_eval.py [-v] [-w weights_file] gold_conllu_file system_conllu_file -# -# - if no -v is given, only the CoNLL17 UD Shared Task evaluation LAS metrics -# is printed -# - if -v is given, several metrics are printed (as precision, recall, F1 score, -# and in case the metric is computed on aligned words also accuracy on these): -# - Tokens: how well do the gold tokens match system tokens -# - Sentences: how well do the gold sentences match system sentences -# - Words: how well can the gold words be aligned to system words -# - UPOS: using aligned words, how well does UPOS match -# - XPOS: using aligned words, how well does XPOS match -# - Feats: using aligned words, how well does FEATS match -# - AllTags: using aligned words, how well does UPOS+XPOS+FEATS match -# - Lemmas: using aligned words, how well does LEMMA match -# - UAS: using aligned words, how well does HEAD match -# - LAS: using aligned words, how well does HEAD+DEPREL(ignoring subtypes) match -# - if weights_file is given (with lines containing deprel-weight pairs), -# one more metric is shown: -# - WeightedLAS: as LAS, but each deprel (ignoring subtypes) has different weight - -# API usage -# --------- -# - load_conllu(file) -# - loads CoNLL-U file from given file object to an internal representation -# - the file object should return str on both Python 2 and Python 3 -# - raises UDError exception if the given file cannot be loaded -# - evaluate(gold_ud, system_ud) -# - evaluate the given gold and system CoNLL-U files (loaded with load_conllu) -# - raises UDError if the concatenated tokens of gold and system file do not match -# - returns a dictionary with the metrics described above, each metrics having -# four fields: precision, recall, f1 and aligned_accuracy (when using aligned -# words, otherwise this is None) - -# Description of token matching -# ----------------------------- -# In order to match tokens of gold file and system file, we consider the text -# resulting from concatenation of gold tokens and text resulting from -# concatenation of system tokens. These texts should match -- if they do not, -# the evaluation fails. -# -# If the texts do match, every token is represented as a range in this original -# text, and tokens are equal only if their range is the same. - -# Description of word matching -# ---------------------------- -# When matching words of gold file and system file, we first match the tokens. -# The words which are also tokens are matched as tokens, but words in multi-word -# tokens have to be handled differently. -# -# To handle multi-word tokens, we start by finding "multi-word spans". -# Multi-word span is a span in the original text such that -# - it contains at least one multi-word token -# - all multi-word tokens in the span (considering both gold and system ones) -# are completely inside the span (i.e., they do not "stick out") -# - the multi-word span is as small as possible -# -# For every multi-word span, we align the gold and system words completely -# inside this span using LCS on their FORMs. 
The words not intersecting -# (even partially) any multi-word span are then aligned as tokens. - - -from __future__ import division -from __future__ import print_function - -import argparse -import io -import sys -import unittest - -# CoNLL-U column names -ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC = range(10) - -# UD Error is used when raising exceptions in this module -class UDError(Exception): - pass - -# Load given CoNLL-U file into internal representation -def load_conllu(file, check_parse=True): - # Internal representation classes - class UDRepresentation: - def __init__(self): - # Characters of all the tokens in the whole file. - # Whitespace between tokens is not included. - self.characters = [] - # List of UDSpan instances with start&end indices into `characters`. - self.tokens = [] - # List of UDWord instances. - self.words = [] - # List of UDSpan instances with start&end indices into `characters`. - self.sentences = [] - class UDSpan: - def __init__(self, start, end, characters): - self.start = start - # Note that self.end marks the first position **after the end** of span, - # so we can use characters[start:end] or range(start, end). - self.end = end - self.characters = characters - - @property - def text(self): - return ''.join(self.characters[self.start:self.end]) - - def __str__(self): - return self.text - - def __repr__(self): - return self.text - class UDWord: - def __init__(self, span, columns, is_multiword): - # Span of this word (or MWT, see below) within ud_representation.characters. - self.span = span - # 10 columns of the CoNLL-U file: ID, FORM, LEMMA,... - self.columns = columns - # is_multiword==True means that this word is part of a multi-word token. - # In that case, self.span marks the span of the whole multi-word token. - self.is_multiword = is_multiword - # Reference to the UDWord instance representing the HEAD (or None if root). - self.parent = None - # Let's ignore language-specific deprel subtypes. 
- self.columns[DEPREL] = columns[DEPREL].split(':')[0] - - ud = UDRepresentation() - - # Load the CoNLL-U file - index, sentence_start = 0, None - linenum = 0 - while True: - line = file.readline() - linenum += 1 - if not line: - break - line = line.rstrip("\r\n") - - # Handle sentence start boundaries - if sentence_start is None: - # Skip comments - if line.startswith("#"): - continue - # Start a new sentence - ud.sentences.append(UDSpan(index, 0, ud.characters)) - sentence_start = len(ud.words) - if not line: - # Add parent UDWord links and check there are no cycles - def process_word(word): - if word.parent == "remapping": - raise UDError("There is a cycle in a sentence") - if word.parent is None: - head = int(word.columns[HEAD]) - if head > len(ud.words) - sentence_start: - raise UDError("Line {}: HEAD '{}' points outside of the sentence".format( - linenum, word.columns[HEAD])) - if head: - parent = ud.words[sentence_start + head - 1] - word.parent = "remapping" - process_word(parent) - word.parent = parent - - for word in ud.words[sentence_start:]: - process_word(word) - - # Check there is a single root node - if check_parse: - if len([word for word in ud.words[sentence_start:] if word.parent is None]) != 1: - raise UDError("There are multiple roots in a sentence") - - # End the sentence - ud.sentences[-1].end = index - sentence_start = None - continue - - # Read next token/word - columns = line.split("\t") - if len(columns) != 10: - raise UDError("The CoNLL-U line {} does not contain 10 tab-separated columns: '{}'".format(linenum, line)) - - # Skip empty nodes - if "." in columns[ID]: - continue - - # Delete spaces from FORM so gold.characters == system.characters - # even if one of them tokenizes the space. - columns[FORM] = columns[FORM].replace(" ", "") - if not columns[FORM]: - raise UDError("There is an empty FORM in the CoNLL-U file -- line %d" % linenum) - - # Save token - ud.characters.extend(columns[FORM]) - ud.tokens.append(UDSpan(index, index + len(columns[FORM]), ud.characters)) - index += len(columns[FORM]) - - # Handle multi-word tokens to save word(s) - if "-" in columns[ID]: - try: - start, end = map(int, columns[ID].split("-")) - except: - raise UDError("Cannot parse multi-word token ID '{}'".format(columns[ID])) - - for _ in range(start, end + 1): - word_line = file.readline().rstrip("\r\n") - word_columns = word_line.split("\t") - if len(word_columns) != 10: - print(columns) - raise UDError("The CoNLL-U line {} does not contain 10 tab-separated columns: '{}'".format(linenum, word_line)) - ud.words.append(UDWord(ud.tokens[-1], word_columns, is_multiword=True)) - # Basic tokens/words - else: - try: - word_id = int(columns[ID]) - except: - raise UDError("Cannot parse word ID '{}'".format(columns[ID])) - if word_id != len(ud.words) - sentence_start + 1: - raise UDError("Incorrect word ID '{}' for word '{}', expected '{}'".format(columns[ID], columns[FORM], len(ud.words) - sentence_start + 1)) - - try: - head_id = int(columns[HEAD]) - except: - raise UDError("Cannot parse HEAD '{}'".format(columns[HEAD])) - if head_id < 0: - raise UDError("HEAD cannot be negative") - - ud.words.append(UDWord(ud.tokens[-1], columns, is_multiword=False)) - - if sentence_start is not None: - raise UDError("The CoNLL-U file does not end with empty line") - - return ud - -# Evaluate the gold and system treebanks (loaded using load_conllu). 
-def evaluate(gold_ud, system_ud, deprel_weights=None, check_parse=True): - class Score: - def __init__(self, gold_total, system_total, correct, aligned_total=None, undersegmented=None, oversegmented=None): - self.precision = correct / system_total if system_total else 0.0 - self.recall = correct / gold_total if gold_total else 0.0 - self.f1 = 2 * correct / (system_total + gold_total) if system_total + gold_total else 0.0 - self.aligned_accuracy = correct / aligned_total if aligned_total else aligned_total - self.undersegmented = undersegmented - self.oversegmented = oversegmented - self.under_perc = len(undersegmented) / gold_total if gold_total and undersegmented else 0.0 - self.over_perc = len(oversegmented) / gold_total if gold_total and oversegmented else 0.0 - class AlignmentWord: - def __init__(self, gold_word, system_word): - self.gold_word = gold_word - self.system_word = system_word - self.gold_parent = None - self.system_parent_gold_aligned = None - class Alignment: - def __init__(self, gold_words, system_words): - self.gold_words = gold_words - self.system_words = system_words - self.matched_words = [] - self.matched_words_map = {} - def append_aligned_words(self, gold_word, system_word): - self.matched_words.append(AlignmentWord(gold_word, system_word)) - self.matched_words_map[system_word] = gold_word - def fill_parents(self): - # We represent root parents in both gold and system data by '0'. - # For gold data, we represent non-root parent by corresponding gold word. - # For system data, we represent non-root parent by either gold word aligned - # to parent system nodes, or by None if no gold words is aligned to the parent. - for words in self.matched_words: - words.gold_parent = words.gold_word.parent if words.gold_word.parent is not None else 0 - words.system_parent_gold_aligned = self.matched_words_map.get(words.system_word.parent, None) \ - if words.system_word.parent is not None else 0 - - def lower(text): - if sys.version_info < (3, 0) and isinstance(text, str): - return text.decode("utf-8").lower() - return text.lower() - - def spans_score(gold_spans, system_spans): - correct, gi, si = 0, 0, 0 - undersegmented = [] - oversegmented = [] - combo = 0 - previous_end_si_earlier = False - previous_end_gi_earlier = False - while gi < len(gold_spans) and si < len(system_spans): - previous_si = system_spans[si-1] if si > 0 else None - previous_gi = gold_spans[gi-1] if gi > 0 else None - if system_spans[si].start < gold_spans[gi].start: - # avoid counting the same mistake twice - if not previous_end_si_earlier: - combo += 1 - oversegmented.append(str(previous_gi).strip()) - si += 1 - elif gold_spans[gi].start < system_spans[si].start: - # avoid counting the same mistake twice - if not previous_end_gi_earlier: - combo += 1 - undersegmented.append(str(previous_si).strip()) - gi += 1 - else: - correct += gold_spans[gi].end == system_spans[si].end - if gold_spans[gi].end < system_spans[si].end: - undersegmented.append(str(system_spans[si]).strip()) - previous_end_gi_earlier = True - previous_end_si_earlier = False - elif gold_spans[gi].end > system_spans[si].end: - oversegmented.append(str(gold_spans[gi]).strip()) - previous_end_si_earlier = True - previous_end_gi_earlier = False - else: - previous_end_gi_earlier = False - previous_end_si_earlier = False - si += 1 - gi += 1 - - return Score(len(gold_spans), len(system_spans), correct, None, undersegmented, oversegmented) - - def alignment_score(alignment, key_fn, weight_fn=lambda w: 1): - gold, system, aligned, correct = 0, 0, 0, 0 
- - for word in alignment.gold_words: - gold += weight_fn(word) - - for word in alignment.system_words: - system += weight_fn(word) - - for words in alignment.matched_words: - aligned += weight_fn(words.gold_word) - - if key_fn is None: - # Return score for whole aligned words - return Score(gold, system, aligned) - - for words in alignment.matched_words: - if key_fn(words.gold_word, words.gold_parent) == key_fn(words.system_word, words.system_parent_gold_aligned): - correct += weight_fn(words.gold_word) - - return Score(gold, system, correct, aligned) - - def beyond_end(words, i, multiword_span_end): - if i >= len(words): - return True - if words[i].is_multiword: - return words[i].span.start >= multiword_span_end - return words[i].span.end > multiword_span_end - - def extend_end(word, multiword_span_end): - if word.is_multiword and word.span.end > multiword_span_end: - return word.span.end - return multiword_span_end - - def find_multiword_span(gold_words, system_words, gi, si): - # We know gold_words[gi].is_multiword or system_words[si].is_multiword. - # Find the start of the multiword span (gs, ss), so the multiword span is minimal. - # Initialize multiword_span_end characters index. - if gold_words[gi].is_multiword: - multiword_span_end = gold_words[gi].span.end - if not system_words[si].is_multiword and system_words[si].span.start < gold_words[gi].span.start: - si += 1 - else: # if system_words[si].is_multiword - multiword_span_end = system_words[si].span.end - if not gold_words[gi].is_multiword and gold_words[gi].span.start < system_words[si].span.start: - gi += 1 - gs, ss = gi, si - - # Find the end of the multiword span - # (so both gi and si are pointing to the word following the multiword span end). - while not beyond_end(gold_words, gi, multiword_span_end) or \ - not beyond_end(system_words, si, multiword_span_end): - if gi < len(gold_words) and (si >= len(system_words) or - gold_words[gi].span.start <= system_words[si].span.start): - multiword_span_end = extend_end(gold_words[gi], multiword_span_end) - gi += 1 - else: - multiword_span_end = extend_end(system_words[si], multiword_span_end) - si += 1 - return gs, ss, gi, si - - def compute_lcs(gold_words, system_words, gi, si, gs, ss): - lcs = [[0] * (si - ss) for i in range(gi - gs)] - for g in reversed(range(gi - gs)): - for s in reversed(range(si - ss)): - if lower(gold_words[gs + g].columns[FORM]) == lower(system_words[ss + s].columns[FORM]): - lcs[g][s] = 1 + (lcs[g+1][s+1] if g+1 < gi-gs and s+1 < si-ss else 0) - lcs[g][s] = max(lcs[g][s], lcs[g+1][s] if g+1 < gi-gs else 0) - lcs[g][s] = max(lcs[g][s], lcs[g][s+1] if s+1 < si-ss else 0) - return lcs - - def align_words(gold_words, system_words): - alignment = Alignment(gold_words, system_words) - - gi, si = 0, 0 - while gi < len(gold_words) and si < len(system_words): - if gold_words[gi].is_multiword or system_words[si].is_multiword: - # A: Multi-word tokens => align via LCS within the whole "multiword span". - gs, ss, gi, si = find_multiword_span(gold_words, system_words, gi, si) - - if si > ss and gi > gs: - lcs = compute_lcs(gold_words, system_words, gi, si, gs, ss) - - # Store aligned words - s, g = 0, 0 - while g < gi - gs and s < si - ss: - if lower(gold_words[gs + g].columns[FORM]) == lower(system_words[ss + s].columns[FORM]): - alignment.append_aligned_words(gold_words[gs+g], system_words[ss+s]) - g += 1 - s += 1 - elif lcs[g][s] == (lcs[g+1][s] if g+1 < gi-gs else 0): - g += 1 - else: - s += 1 - else: - # B: No multi-word token => align according to spans. 
- if (gold_words[gi].span.start, gold_words[gi].span.end) == (system_words[si].span.start, system_words[si].span.end): - alignment.append_aligned_words(gold_words[gi], system_words[si]) - gi += 1 - si += 1 - elif gold_words[gi].span.start <= system_words[si].span.start: - gi += 1 - else: - si += 1 - - alignment.fill_parents() - - return alignment - - # Check that underlying character sequences do match - if gold_ud.characters != system_ud.characters: - index = 0 - while gold_ud.characters[index] == system_ud.characters[index]: - index += 1 - - raise UDError( - "The concatenation of tokens in gold file and in system file differ!\n" + - "First 20 differing characters in gold file: '{}' and system file: '{}'".format( - "".join(gold_ud.characters[index:index + 20]), - "".join(system_ud.characters[index:index + 20]) - ) - ) - - # Align words - alignment = align_words(gold_ud.words, system_ud.words) - - # Compute the F1-scores - if check_parse: - result = { - "Tokens": spans_score(gold_ud.tokens, system_ud.tokens), - "Sentences": spans_score(gold_ud.sentences, system_ud.sentences), - "Words": alignment_score(alignment, None), - "UPOS": alignment_score(alignment, lambda w, parent: w.columns[UPOS]), - "XPOS": alignment_score(alignment, lambda w, parent: w.columns[XPOS]), - "Feats": alignment_score(alignment, lambda w, parent: w.columns[FEATS]), - "AllTags": alignment_score(alignment, lambda w, parent: (w.columns[UPOS], w.columns[XPOS], w.columns[FEATS])), - "Lemmas": alignment_score(alignment, lambda w, parent: w.columns[LEMMA]), - "UAS": alignment_score(alignment, lambda w, parent: parent), - "LAS": alignment_score(alignment, lambda w, parent: (parent, w.columns[DEPREL])), - } - else: - result = { - "Tokens": spans_score(gold_ud.tokens, system_ud.tokens), - "Sentences": spans_score(gold_ud.sentences, system_ud.sentences), - "Words": alignment_score(alignment, None), - "Feats": alignment_score(alignment, lambda w, parent: w.columns[FEATS]), - "Lemmas": alignment_score(alignment, lambda w, parent: w.columns[LEMMA]), - } - - - # Add WeightedLAS if weights are given - if deprel_weights is not None: - def weighted_las(word): - return deprel_weights.get(word.columns[DEPREL], 1.0) - result["WeightedLAS"] = alignment_score(alignment, lambda w, parent: (parent, w.columns[DEPREL]), weighted_las) - - return result - -def load_deprel_weights(weights_file): - if weights_file is None: - return None - - deprel_weights = {} - for line in weights_file: - # Ignore comments and empty lines - if line.startswith("#") or not line.strip(): - continue - - columns = line.rstrip("\r\n").split() - if len(columns) != 2: - raise ValueError("Expected two columns in the UD Relations weights file on line '{}'".format(line)) - - deprel_weights[columns[0]] = float(columns[1]) - - return deprel_weights - -def load_conllu_file(path): - _file = open(path, mode="r", **({"encoding": "utf-8"} if sys.version_info >= (3, 0) else {})) - return load_conllu(_file) - -def evaluate_wrapper(args): - # Load CoNLL-U files - gold_ud = load_conllu_file(args.gold_file) - system_ud = load_conllu_file(args.system_file) - - # Load weights if requested - deprel_weights = load_deprel_weights(args.weights) - - return evaluate(gold_ud, system_ud, deprel_weights) - -def main(): - # Parse arguments - parser = argparse.ArgumentParser() - parser.add_argument("gold_file", type=str, - help="Name of the CoNLL-U file with the gold data.") - parser.add_argument("system_file", type=str, - help="Name of the CoNLL-U file with the predicted data.") - 
parser.add_argument("--weights", "-w", type=argparse.FileType("r"), default=None, - metavar="deprel_weights_file", - help="Compute WeightedLAS using given weights for Universal Dependency Relations.") - parser.add_argument("--verbose", "-v", default=0, action="count", - help="Print all metrics.") - args = parser.parse_args() - - # Use verbose if weights are supplied - if args.weights is not None and not args.verbose: - args.verbose = 1 - - # Evaluate - evaluation = evaluate_wrapper(args) - - # Print the evaluation - if not args.verbose: - print("LAS F1 Score: {:.2f}".format(100 * evaluation["LAS"].f1)) - else: - metrics = ["Tokens", "Sentences", "Words", "UPOS", "XPOS", "Feats", "AllTags", "Lemmas", "UAS", "LAS"] - if args.weights is not None: - metrics.append("WeightedLAS") - - print("Metrics | Precision | Recall | F1 Score | AligndAcc") - print("-----------+-----------+-----------+-----------+-----------") - for metric in metrics: - print("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{}".format( - metric, - 100 * evaluation[metric].precision, - 100 * evaluation[metric].recall, - 100 * evaluation[metric].f1, - "{:10.2f}".format(100 * evaluation[metric].aligned_accuracy) if evaluation[metric].aligned_accuracy is not None else "" - )) - -if __name__ == "__main__": - main() - -# Tests, which can be executed with `python -m unittest conll17_ud_eval`. -class TestAlignment(unittest.TestCase): - @staticmethod - def _load_words(words): - """Prepare fake CoNLL-U files with fake HEAD to prevent multiple roots errors.""" - lines, num_words = [], 0 - for w in words: - parts = w.split(" ") - if len(parts) == 1: - num_words += 1 - lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(num_words, parts[0], int(num_words>1))) - else: - lines.append("{}-{}\t{}\t_\t_\t_\t_\t_\t_\t_\t_".format(num_words + 1, num_words + len(parts) - 1, parts[0])) - for part in parts[1:]: - num_words += 1 - lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(num_words, part, int(num_words>1))) - return load_conllu((io.StringIO if sys.version_info >= (3, 0) else io.BytesIO)("\n".join(lines+["\n"]))) - - def _test_exception(self, gold, system): - self.assertRaises(UDError, evaluate, self._load_words(gold), self._load_words(system)) - - def _test_ok(self, gold, system, correct): - metrics = evaluate(self._load_words(gold), self._load_words(system)) - gold_words = sum((max(1, len(word.split(" ")) - 1) for word in gold)) - system_words = sum((max(1, len(word.split(" ")) - 1) for word in system)) - self.assertEqual((metrics["Words"].precision, metrics["Words"].recall, metrics["Words"].f1), - (correct / system_words, correct / gold_words, 2 * correct / (gold_words + system_words))) - - def test_exception(self): - self._test_exception(["a"], ["b"]) - - def test_equal(self): - self._test_ok(["a"], ["a"], 1) - self._test_ok(["a", "b", "c"], ["a", "b", "c"], 3) - - def test_equal_with_multiword(self): - self._test_ok(["abc a b c"], ["a", "b", "c"], 3) - self._test_ok(["a", "bc b c", "d"], ["a", "b", "c", "d"], 4) - self._test_ok(["abcd a b c d"], ["ab a b", "cd c d"], 4) - self._test_ok(["abc a b c", "de d e"], ["a", "bcd b c d", "e"], 5) - - def test_alignment(self): - self._test_ok(["abcd"], ["a", "b", "c", "d"], 0) - self._test_ok(["abc", "d"], ["a", "b", "c", "d"], 1) - self._test_ok(["a", "bc", "d"], ["a", "b", "c", "d"], 2) - self._test_ok(["a", "bc b c", "d"], ["a", "b", "cd"], 2) - self._test_ok(["abc a BX c", "def d EX f"], ["ab a b", "cd c d", "ef e f"], 4) - self._test_ok(["ab a b", "cd bc d"], ["a", "bc", "d"], 2) - 
self._test_ok(["a", "bc b c", "d"], ["ab AX BX", "cd CX a"], 1) diff --git a/bin/ud/run_eval.py b/bin/ud/run_eval.py deleted file mode 100644 index 3a30c0ee9..000000000 --- a/bin/ud/run_eval.py +++ /dev/null @@ -1,297 +0,0 @@ -import spacy -import time -import re -import plac -import operator -import datetime -from pathlib import Path -import xml.etree.ElementTree as ET - -import conll17_ud_eval -from ud_train import write_conllu -from spacy.lang.lex_attrs import word_shape -from spacy.util import get_lang_class - -# All languages in spaCy format (note that Norwegian is 'no' in UD - gets remapped later) -ALL_LANGUAGES = ("af, ar, bg, bn, ca, cs, da, de, el, en, es, et, eu, fa, fi, fr," - "ga, gu, he, hi, hr, hu, hy, id, is, it, ja, kn, ko, lb, lij, lt, lv, ml, mr, nb," - "nl, pl, pt, ro, ru, si, sk, sl, sq, sr, sv, ta, te, th, tl," - "tr, tt, uk, ur, vi, yo, zh") - -# Non-parsing tasks that will be evaluated (works for default models) -EVAL_NO_PARSE = ['Tokens', 'Words', 'Lemmas', 'Sentences', 'Feats'] - -# Tasks that will be evaluated if check_parse=True (does not work for default models) -EVAL_PARSE = ['Tokens', 'Words', 'Lemmas', 'Sentences', 'Feats', 'UPOS', 'XPOS', 'AllTags', 'UAS', 'LAS'] - -# Minimum frequency an error should have to be printed -PRINT_FREQ = 20 - -# Maximum number of errors printed per category -PRINT_TOTAL = 10 - -space_re = re.compile("\s+") - - -def load_model(modelname, add_sentencizer=False): - """ Load a specific spaCy model """ - loading_start = time.time() - nlp = spacy.load(modelname) - if add_sentencizer: - nlp.add_pipe(nlp.create_pipe('sentencizer')) - loading_end = time.time() - loading_time = loading_end - loading_start - if add_sentencizer: - return nlp, loading_time, modelname + '_sentencizer' - return nlp, loading_time, modelname - - -def load_default_model_sentencizer(lang): - """ Load a generic spaCy model and add the sentencizer for sentence tokenization""" - loading_start = time.time() - lang_class = get_lang_class(lang) - nlp = lang_class() - nlp.add_pipe(nlp.create_pipe('sentencizer')) - loading_end = time.time() - loading_time = loading_end - loading_start - return nlp, loading_time, lang + "_default_" + 'sentencizer' - - -def split_text(text): - return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")] - - -def get_freq_tuples(my_list, print_total_threshold): - """ Turn a list of errors into frequency-sorted tuples thresholded by a certain total number """ - d = {} - for token in my_list: - d.setdefault(token, 0) - d[token] += 1 - return sorted(d.items(), key=operator.itemgetter(1), reverse=True)[:print_total_threshold] - - -def _contains_blinded_text(stats_xml): - """ Heuristic to determine whether the treebank has blinded texts or not """ - tree = ET.parse(stats_xml) - root = tree.getroot() - total_tokens = int(root.find('size/total/tokens').text) - unique_forms = int(root.find('forms').get('unique')) - - # assume the corpus is largely blinded when there are less than 1% unique tokens - return (unique_forms / total_tokens) < 0.01 - - -def fetch_all_treebanks(ud_dir, languages, corpus, best_per_language): - """" Fetch the txt files for all treebanks for a given set of languages """ - all_treebanks = dict() - treebank_size = dict() - for l in languages: - all_treebanks[l] = [] - treebank_size[l] = 0 - - for treebank_dir in ud_dir.iterdir(): - if treebank_dir.is_dir(): - for txt_path in treebank_dir.iterdir(): - if txt_path.name.endswith('-ud-' + corpus + '.txt'): - file_lang = txt_path.name.split('_')[0] - if file_lang in 
languages: - gold_path = treebank_dir / txt_path.name.replace('.txt', '.conllu') - stats_xml = treebank_dir / "stats.xml" - # ignore treebanks where the texts are not publicly available - if not _contains_blinded_text(stats_xml): - if not best_per_language: - all_treebanks[file_lang].append(txt_path) - # check the tokens in the gold annotation to keep only the biggest treebank per language - else: - with gold_path.open(mode='r', encoding='utf-8') as gold_file: - gold_ud = conll17_ud_eval.load_conllu(gold_file) - gold_tokens = len(gold_ud.tokens) - if treebank_size[file_lang] < gold_tokens: - all_treebanks[file_lang] = [txt_path] - treebank_size[file_lang] = gold_tokens - - return all_treebanks - - -def run_single_eval(nlp, loading_time, print_name, text_path, gold_ud, tmp_output_path, out_file, print_header, - check_parse, print_freq_tasks): - """" Run an evaluation of a model nlp on a certain specified treebank """ - with text_path.open(mode='r', encoding='utf-8') as f: - flat_text = f.read() - - # STEP 1: tokenize text - tokenization_start = time.time() - texts = split_text(flat_text) - docs = list(nlp.pipe(texts)) - tokenization_end = time.time() - tokenization_time = tokenization_end - tokenization_start - - # STEP 2: record stats and timings - tokens_per_s = int(len(gold_ud.tokens) / tokenization_time) - - print_header_1 = ['date', 'text_path', 'gold_tokens', 'model', 'loading_time', 'tokenization_time', 'tokens_per_s'] - print_string_1 = [str(datetime.date.today()), text_path.name, len(gold_ud.tokens), - print_name, "%.2f" % loading_time, "%.2f" % tokenization_time, tokens_per_s] - - # STEP 3: evaluate predicted tokens and features - with tmp_output_path.open(mode="w", encoding="utf8") as tmp_out_file: - write_conllu(docs, tmp_out_file) - with tmp_output_path.open(mode="r", encoding="utf8") as sys_file: - sys_ud = conll17_ud_eval.load_conllu(sys_file, check_parse=check_parse) - tmp_output_path.unlink() - scores = conll17_ud_eval.evaluate(gold_ud, sys_ud, check_parse=check_parse) - - # STEP 4: format the scoring results - eval_headers = EVAL_PARSE - if not check_parse: - eval_headers = EVAL_NO_PARSE - - for score_name in eval_headers: - score = scores[score_name] - print_string_1.extend(["%.2f" % score.precision, - "%.2f" % score.recall, - "%.2f" % score.f1]) - print_string_1.append("-" if score.aligned_accuracy is None else "%.2f" % score.aligned_accuracy) - print_string_1.append("-" if score.undersegmented is None else "%.4f" % score.under_perc) - print_string_1.append("-" if score.oversegmented is None else "%.4f" % score.over_perc) - - print_header_1.extend([score_name + '_p', score_name + '_r', score_name + '_F', score_name + '_acc', - score_name + '_under', score_name + '_over']) - - if score_name in print_freq_tasks: - print_header_1.extend([score_name + '_word_under_ex', score_name + '_shape_under_ex', - score_name + '_word_over_ex', score_name + '_shape_over_ex']) - - d_under_words = get_freq_tuples(score.undersegmented, PRINT_TOTAL) - d_under_shapes = get_freq_tuples([word_shape(x) for x in score.undersegmented], PRINT_TOTAL) - d_over_words = get_freq_tuples(score.oversegmented, PRINT_TOTAL) - d_over_shapes = get_freq_tuples([word_shape(x) for x in score.oversegmented], PRINT_TOTAL) - - # saving to CSV with ; seperator so blinding ; in the example output - print_string_1.append( - str({k: v for k, v in d_under_words if v > PRINT_FREQ}).replace(";", "*SEMICOLON*")) - print_string_1.append( - str({k: v for k, v in d_under_shapes if v > PRINT_FREQ}).replace(";", "*SEMICOLON*")) 
- print_string_1.append( - str({k: v for k, v in d_over_words if v > PRINT_FREQ}).replace(";", "*SEMICOLON*")) - print_string_1.append( - str({k: v for k, v in d_over_shapes if v > PRINT_FREQ}).replace(";", "*SEMICOLON*")) - - # STEP 5: print the formatted results to CSV - if print_header: - out_file.write(';'.join(map(str, print_header_1)) + '\n') - out_file.write(';'.join(map(str, print_string_1)) + '\n') - - -def run_all_evals(models, treebanks, out_file, check_parse, print_freq_tasks): - """" Run an evaluation for each language with its specified models and treebanks """ - print_header = True - - for tb_lang, treebank_list in treebanks.items(): - print() - print("Language", tb_lang) - for text_path in treebank_list: - print(" Evaluating on", text_path) - - gold_path = text_path.parent / (text_path.stem + '.conllu') - print(" Gold data from ", gold_path) - - # nested try blocks to ensure the code can continue with the next iteration after a failure - try: - with gold_path.open(mode='r', encoding='utf-8') as gold_file: - gold_ud = conll17_ud_eval.load_conllu(gold_file) - - for nlp, nlp_loading_time, nlp_name in models[tb_lang]: - try: - print(" Benchmarking", nlp_name) - tmp_output_path = text_path.parent / str('tmp_' + nlp_name + '.conllu') - run_single_eval(nlp, nlp_loading_time, nlp_name, text_path, gold_ud, tmp_output_path, out_file, - print_header, check_parse, print_freq_tasks) - print_header = False - except Exception as e: - print(" Ran into trouble: ", str(e)) - except Exception as e: - print(" Ran into trouble: ", str(e)) - - -@plac.annotations( - out_path=("Path to output CSV file", "positional", None, Path), - ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path), - check_parse=("Set flag to evaluate parsing performance", "flag", "p", bool), - langs=("Enumeration of languages to evaluate (default: all)", "option", "l", str), - exclude_trained_models=("Set flag to exclude trained models", "flag", "t", bool), - exclude_multi=("Set flag to exclude the multi-language model as default baseline", "flag", "m", bool), - hide_freq=("Set flag to avoid printing out more detailed high-freq tokenization errors", "flag", "f", bool), - corpus=("Whether to run on train, dev or test", "option", "c", str), - best_per_language=("Set flag to only keep the largest treebank for each language", "flag", "b", bool) -) -def main(out_path, ud_dir, check_parse=False, langs=ALL_LANGUAGES, exclude_trained_models=False, exclude_multi=False, - hide_freq=False, corpus='train', best_per_language=False): - """" - Assemble all treebanks and models to run evaluations with. 
- When setting check_parse to True, the default models will not be evaluated as they don't have parsing functionality - """ - languages = [lang.strip() for lang in langs.split(",")] - - print_freq_tasks = [] - if not hide_freq: - print_freq_tasks = ['Tokens'] - - # fetching all relevant treebank from the directory - treebanks = fetch_all_treebanks(ud_dir, languages, corpus, best_per_language) - - print() - print("Loading all relevant models for", languages) - models = dict() - - # multi-lang model - multi = None - if not exclude_multi and not check_parse: - multi = load_model('xx_ent_wiki_sm', add_sentencizer=True) - - # initialize all models with the multi-lang model - for lang in languages: - UD_lang = lang - # Norwegian is 'nb' in spaCy but 'no' in the UD corpora - if lang == "nb": - UD_lang = "no" - try: - models[UD_lang] = [multi] if multi else [] - # add default models if we don't want to evaluate parsing info - if not check_parse: - models[UD_lang].append(load_default_model_sentencizer(lang)) - except: - print(f"Exception initializing lang {lang} - skipping") - - # language-specific trained models - if not exclude_trained_models: - news_languages = ["da", "de", "el", "es", "fr", "it", "ja", "lt", "nb", "nl", "pl", "pt", "ro"] - news_languages = ["nb"] - web_languages = ["en", "zh"] - sizes = ["sm", "md", "lg"] - for lang in web_languages: - UD_lang = lang - for size in sizes: - model_name = f'{lang}_core_web_{size}' - try: - models[UD_lang].append(load_model(model_name)) - except Exception as e: - print(f"Error loading {model_name}: {e}") - - for lang in news_languages: - UD_lang = lang - if lang == "nb": - UD_lang = "no" - for size in sizes: - model_name = f'{lang}_core_news_{size}' - try: - models[UD_lang].append(load_model(model_name)) - except Exception as e: - print(f"Error loading {model_name}: {e}") - - with out_path.open(mode='w', encoding='utf-8') as out_file: - run_all_evals(models, treebanks, out_file, check_parse, print_freq_tasks) - - -if __name__ == "__main__": - plac.call(main) diff --git a/bin/ud/ud_run_test.py b/bin/ud/ud_run_test.py deleted file mode 100644 index 70c6be0d0..000000000 --- a/bin/ud/ud_run_test.py +++ /dev/null @@ -1,324 +0,0 @@ -# flake8: noqa -"""Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes -.conllu format for development data, allowing the official scorer to be used. -""" -from __future__ import unicode_literals - -import plac -from pathlib import Path -import re -import sys -import srsly - -import spacy -import spacy.util -from spacy.tokens import Token, Doc -from spacy.matcher import Matcher - - -Fused_begin = None -Fused_inside = None - -from . 
import conll17_ud_eval - -from spacy import lang -from spacy.lang import zh -from spacy.lang import ja -from spacy.lang import ru - - -################ -# Data reading # -################ - -space_re = re.compile(r"\s+") - - -def split_text(text): - return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")] - - -############## -# Evaluation # -############## - - -def read_conllu(file_): - docs = [] - sent = [] - doc = [] - for line in file_: - if line.startswith("# newdoc"): - if doc: - docs.append(doc) - doc = [] - elif line.startswith("#"): - continue - elif not line.strip(): - if sent: - doc.append(sent) - sent = [] - else: - sent.append(list(line.strip().split("\t"))) - if len(sent[-1]) != 10: - print(repr(line)) - raise ValueError - if sent: - doc.append(sent) - if doc: - docs.append(doc) - return docs - - -def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None): - if text_loc.parts[-1].endswith(".conllu"): - docs = [] - with text_loc.open(encoding="utf8") as file_: - for conllu_doc in read_conllu(file_): - for conllu_sent in conllu_doc: - words = [line[1] for line in conllu_sent] - docs.append(Doc(nlp.vocab, words=words)) - for name, component in nlp.pipeline: - docs = list(component.pipe(docs)) - else: - with text_loc.open("r", encoding="utf8") as text_file: - texts = split_text(text_file.read()) - docs = list(nlp.pipe(texts)) - with sys_loc.open("w", encoding="utf8") as out_file: - write_conllu(docs, out_file) - with gold_loc.open("r", encoding="utf8") as gold_file: - gold_ud = conll17_ud_eval.load_conllu(gold_file) - with sys_loc.open("r", encoding="utf8") as sys_file: - sys_ud = conll17_ud_eval.load_conllu(sys_file) - scores = conll17_ud_eval.evaluate(gold_ud, sys_ud) - return docs, scores - - -def write_conllu(docs, file_): - merger = Matcher(docs[0].vocab) - merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}]) - for i, doc in enumerate(docs): - matches = [] - if doc.is_parsed: - matches = merger(doc) - spans = [doc[start : end + 1] for _, start, end in matches] - with doc.retokenize() as retokenizer: - for span in spans: - retokenizer.merge(span) - file_.write("# newdoc id = {i}\n".format(i=i)) - for j, sent in enumerate(doc.sents): - file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j)) - file_.write("# text = {text}\n".format(text=sent.text)) - for k, token in enumerate(sent): - file_.write(_get_token_conllu(token, k, len(sent)) + "\n") - file_.write("\n") - for word in sent: - if word.head.i == word.i and word.dep_ == "ROOT": - break - else: - print("Rootless sentence!") - print(sent) - print(i) - for w in sent: - print(w.i, w.text, w.head.text, w.head.i, w.dep_) - raise ValueError - - -def _get_token_conllu(token, k, sent_len): - if token.check_morph(Fused_begin) and (k + 1 < sent_len): - n = 1 - text = [token.text] - while token.nbor(n).check_morph(Fused_inside): - text.append(token.nbor(n).text) - n += 1 - id_ = "%d-%d" % (k + 1, (k + n)) - fields = [id_, "".join(text)] + ["_"] * 8 - lines = ["\t".join(fields)] - else: - lines = [] - if token.head.i == token.i: - head = 0 - else: - head = k + (token.head.i - token.i) + 1 - fields = [ - str(k + 1), - token.text, - token.lemma_, - token.pos_, - token.tag_, - "_", - str(head), - token.dep_.lower(), - "_", - "_", - ] - if token.check_morph(Fused_begin) and (k + 1 < sent_len): - if k == 0: - fields[1] = token.norm_[0].upper() + token.norm_[1:] - else: - fields[1] = token.norm_ - elif token.check_morph(Fused_inside): - fields[1] = token.norm_ - elif token._.split_start is not None: - split_start = 
token._.split_start - split_end = token._.split_end - split_len = (split_end.i - split_start.i) + 1 - n_in_split = token.i - split_start.i - subtokens = guess_fused_orths(split_start.text, [""] * split_len) - fields[1] = subtokens[n_in_split] - - lines.append("\t".join(fields)) - return "\n".join(lines) - - -def guess_fused_orths(word, ud_forms): - """The UD data 'fused tokens' don't necessarily expand to keys that match - the form. We need orths that exact match the string. Here we make a best - effort to divide up the word.""" - if word == "".join(ud_forms): - # Happy case: we get a perfect split, with each letter accounted for. - return ud_forms - elif len(word) == sum(len(subtoken) for subtoken in ud_forms): - # Unideal, but at least lengths match. - output = [] - remain = word - for subtoken in ud_forms: - assert len(subtoken) >= 1 - output.append(remain[: len(subtoken)]) - remain = remain[len(subtoken) :] - assert len(remain) == 0, (word, ud_forms, remain) - return output - else: - # Let's say word is 6 long, and there are three subtokens. The orths - # *must* equal the original string. Arbitrarily, split [4, 1, 1] - first = word[: len(word) - (len(ud_forms) - 1)] - output = [first] - remain = word[len(first) :] - for i in range(1, len(ud_forms)): - assert remain - output.append(remain[:1]) - remain = remain[1:] - assert len(remain) == 0, (word, output, remain) - return output - - -def print_results(name, ud_scores): - fields = {} - if ud_scores is not None: - fields.update( - { - "words": ud_scores["Words"].f1 * 100, - "sents": ud_scores["Sentences"].f1 * 100, - "tags": ud_scores["XPOS"].f1 * 100, - "uas": ud_scores["UAS"].f1 * 100, - "las": ud_scores["LAS"].f1 * 100, - } - ) - else: - fields.update({"words": 0.0, "sents": 0.0, "tags": 0.0, "uas": 0.0, "las": 0.0}) - tpl = "\t".join( - (name, "{las:.1f}", "{uas:.1f}", "{tags:.1f}", "{sents:.1f}", "{words:.1f}") - ) - print(tpl.format(**fields)) - return fields - - -def get_token_split_start(token): - if token.text == "": - assert token.i != 0 - i = -1 - while token.nbor(i).text == "": - i -= 1 - return token.nbor(i) - elif (token.i + 1) < len(token.doc) and token.nbor(1).text == "": - return token - else: - return None - - -def get_token_split_end(token): - if (token.i + 1) == len(token.doc): - return token if token.text == "" else None - elif token.text != "" and token.nbor(1).text != "": - return None - i = 1 - while (token.i + i) < len(token.doc) and token.nbor(i).text == "": - i += 1 - return token.nbor(i - 1) - - -################## -# Initialization # -################## - - -def load_nlp(experiments_dir, corpus): - nlp = spacy.load(experiments_dir / corpus / "best-model") - return nlp - - -def initialize_pipeline(nlp, examples, config, device): - nlp.add_pipe(nlp.create_pipe("parser")) - return nlp - - -@plac.annotations( - test_data_dir=( - "Path to Universal Dependencies test data", - "positional", - None, - Path, - ), - experiment_dir=("Parent directory with output model", "positional", None, Path), - corpus=( - "UD corpus to evaluate, e.g. 
UD_English, UD_Spanish, etc", - "positional", - None, - str, - ), -) -def main(test_data_dir, experiment_dir, corpus): - Token.set_extension("split_start", getter=get_token_split_start) - Token.set_extension("split_end", getter=get_token_split_end) - Token.set_extension("begins_fused", default=False) - Token.set_extension("inside_fused", default=False) - lang.zh.Chinese.Defaults.use_jieba = False - lang.ja.Japanese.Defaults.use_janome = False - lang.ru.Russian.Defaults.use_pymorphy2 = False - - nlp = load_nlp(experiment_dir, corpus) - - treebank_code = nlp.meta["treebank"] - for section in ("test", "dev"): - if section == "dev": - section_dir = "conll17-ud-development-2017-03-19" - else: - section_dir = "conll17-ud-test-2017-05-09" - text_path = test_data_dir / "input" / section_dir / (treebank_code + ".txt") - udpipe_path = ( - test_data_dir / "input" / section_dir / (treebank_code + "-udpipe.conllu") - ) - gold_path = test_data_dir / "gold" / section_dir / (treebank_code + ".conllu") - - header = [section, "LAS", "UAS", "TAG", "SENT", "WORD"] - print("\t".join(header)) - inputs = {"gold": gold_path, "udp": udpipe_path, "raw": text_path} - for input_type in ("udp", "raw"): - input_path = inputs[input_type] - output_path = ( - experiment_dir / corpus / "{section}.conllu".format(section=section) - ) - - parsed_docs, test_scores = evaluate(nlp, input_path, gold_path, output_path) - - accuracy = print_results(input_type, test_scores) - acc_path = ( - experiment_dir - / corpus - / "{section}-accuracy.json".format(section=section) - ) - srsly.write_json(acc_path, accuracy) - - -if __name__ == "__main__": - plac.call(main) diff --git a/bin/ud/ud_train.py b/bin/ud/ud_train.py deleted file mode 100644 index 362057b37..000000000 --- a/bin/ud/ud_train.py +++ /dev/null @@ -1,559 +0,0 @@ -# flake8: noqa -"""Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes -.conllu format for development data, allowing the official scorer to be used. -""" -from __future__ import unicode_literals - -import plac -from pathlib import Path -import re -import json -import tqdm - -import spacy -import spacy.util -from bin.ud import conll17_ud_eval -from spacy.tokens import Token, Doc -from spacy.gold import Example -from spacy.util import compounding, minibatch -from spacy.gold.batchers import minibatch_by_words -from spacy.pipeline._parser_internals.nonproj import projectivize -from spacy.matcher import Matcher -from spacy import displacy -from collections import defaultdict - -import random - -from spacy import lang -from spacy.lang import zh -from spacy.lang import ja - -try: - import torch -except ImportError: - torch = None - - -################ -# Data reading # -################ - -space_re = re.compile("\s+") - - -def split_text(text): - return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")] - - -def read_data( - nlp, - conllu_file, - text_file, - raw_text=True, - oracle_segments=False, - max_doc_length=None, - limit=None, -): - """Read the CONLLU format into Example objects. If raw_text=True, - include Doc objects created using nlp.make_doc and then aligned against - the gold-standard sequences. If oracle_segments=True, include Doc objects - created from the gold-standard segments. 
At least one must be True.""" - if not raw_text and not oracle_segments: - raise ValueError("At least one of raw_text or oracle_segments must be True") - paragraphs = split_text(text_file.read()) - conllu = read_conllu(conllu_file) - # sd is spacy doc; cd is conllu doc - # cs is conllu sent, ct is conllu token - docs = [] - golds = [] - for doc_id, (text, cd) in enumerate(zip(paragraphs, conllu)): - sent_annots = [] - for cs in cd: - sent = defaultdict(list) - for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs: - if "." in id_: - continue - if "-" in id_: - continue - id_ = int(id_) - 1 - head = int(head) - 1 if head != "0" else id_ - sent["words"].append(word) - sent["tags"].append(tag) - sent["morphs"].append(_compile_morph_string(morph, pos)) - sent["heads"].append(head) - sent["deps"].append("ROOT" if dep == "root" else dep) - sent["spaces"].append(space_after == "_") - sent["entities"] = ["-"] * len(sent["words"]) # TODO: doc-level format - sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"]) - if oracle_segments: - docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"])) - golds.append(sent) - assert golds[-1]["morphs"] is not None - - sent_annots.append(sent) - if raw_text and max_doc_length and len(sent_annots) >= max_doc_length: - doc, gold = _make_gold(nlp, None, sent_annots) - assert gold["morphs"] is not None - sent_annots = [] - docs.append(doc) - golds.append(gold) - if limit and len(docs) >= limit: - return golds_to_gold_data(docs, golds) - - if raw_text and sent_annots: - doc, gold = _make_gold(nlp, None, sent_annots) - docs.append(doc) - golds.append(gold) - if limit and len(docs) >= limit: - return golds_to_gold_data(docs, golds) - return golds_to_gold_data(docs, golds) - - -def _compile_morph_string(morph_string, pos): - if morph_string == '_': - return f"POS={pos}" - return morph_string + f"|POS={pos}" - - -def read_conllu(file_): - docs = [] - sent = [] - doc = [] - for line in file_: - if line.startswith("# newdoc"): - if doc: - docs.append(doc) - doc = [] - elif line.startswith("#"): - continue - elif not line.strip(): - if sent: - doc.append(sent) - sent = [] - else: - sent.append(list(line.strip().split("\t"))) - if len(sent[-1]) != 10: - print(repr(line)) - raise ValueError - if sent: - doc.append(sent) - if doc: - docs.append(doc) - return docs - - -def _make_gold(nlp, text, sent_annots, drop_deps=0.0): - # Flatten the conll annotations, and adjust the head indices - gold = defaultdict(list) - sent_starts = [] - for sent in sent_annots: - gold["heads"].extend(len(gold["words"])+head for head in sent["heads"]) - for field in ["words", "tags", "deps", "morphs", "entities", "spaces"]: - gold[field].extend(sent[field]) - sent_starts.append(True) - sent_starts.extend([False] * (len(sent["words"]) - 1)) - # Construct text if necessary - assert len(gold["words"]) == len(gold["spaces"]) - if text is None: - text = "".join( - word + " " * space for word, space in zip(gold["words"], gold["spaces"]) - ) - doc = nlp.make_doc(text) - gold.pop("spaces") - gold["sent_starts"] = sent_starts - for i in range(len(gold["heads"])): - if random.random() < drop_deps: - gold["heads"][i] = None - gold["labels"][i] = None - - return doc, gold - - -############################# -# Data transforms for spaCy # -############################# - - -def golds_to_gold_data(docs, golds): - """Get out the training data format used by begin_training""" - data = [] - for doc, gold in zip(docs, golds): - example = Example.from_dict(doc, 
dict(gold)) - data.append(example) - return data - - -############## -# Evaluation # -############## - - -def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None): - if text_loc.parts[-1].endswith(".conllu"): - docs = [] - with text_loc.open(encoding="utf8") as file_: - for conllu_doc in read_conllu(file_): - for conllu_sent in conllu_doc: - words = [line[1] for line in conllu_sent] - docs.append(Doc(nlp.vocab, words=words)) - for name, component in nlp.pipeline: - docs = list(component.pipe(docs)) - else: - with text_loc.open("r", encoding="utf8") as text_file: - texts = split_text(text_file.read()) - docs = list(nlp.pipe(texts)) - with sys_loc.open("w", encoding="utf8") as out_file: - write_conllu(docs, out_file) - with gold_loc.open("r", encoding="utf8") as gold_file: - gold_ud = conll17_ud_eval.load_conllu(gold_file) - with sys_loc.open("r", encoding="utf8") as sys_file: - sys_ud = conll17_ud_eval.load_conllu(sys_file) - scores = conll17_ud_eval.evaluate(gold_ud, sys_ud) - return docs, scores - - -def write_conllu(docs, file_): - if not Token.has_extension("get_conllu_lines"): - Token.set_extension("get_conllu_lines", method=get_token_conllu) - if not Token.has_extension("begins_fused"): - Token.set_extension("begins_fused", default=False) - if not Token.has_extension("inside_fused"): - Token.set_extension("inside_fused", default=False) - - merger = Matcher(docs[0].vocab) - merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}]) - for i, doc in enumerate(docs): - matches = [] - if doc.is_parsed: - matches = merger(doc) - spans = [doc[start : end + 1] for _, start, end in matches] - seen_tokens = set() - with doc.retokenize() as retokenizer: - for span in spans: - span_tokens = set(range(span.start, span.end)) - if not span_tokens.intersection(seen_tokens): - retokenizer.merge(span) - seen_tokens.update(span_tokens) - - file_.write("# newdoc id = {i}\n".format(i=i)) - for j, sent in enumerate(doc.sents): - file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j)) - file_.write("# text = {text}\n".format(text=sent.text)) - for k, token in enumerate(sent): - if token.head.i > sent[-1].i or token.head.i < sent[0].i: - for word in doc[sent[0].i - 10 : sent[0].i]: - print(word.i, word.head.i, word.text, word.dep_) - for word in sent: - print(word.i, word.head.i, word.text, word.dep_) - for word in doc[sent[-1].i : sent[-1].i + 10]: - print(word.i, word.head.i, word.text, word.dep_) - raise ValueError( - "Invalid parse: head outside sentence (%s)" % token.text - ) - file_.write(token._.get_conllu_lines(k) + "\n") - file_.write("\n") - - -def print_progress(itn, losses, ud_scores): - fields = { - "dep_loss": losses.get("parser", 0.0), - "morph_loss": losses.get("morphologizer", 0.0), - "tag_loss": losses.get("tagger", 0.0), - "words": ud_scores["Words"].f1 * 100, - "sents": ud_scores["Sentences"].f1 * 100, - "tags": ud_scores["XPOS"].f1 * 100, - "uas": ud_scores["UAS"].f1 * 100, - "las": ud_scores["LAS"].f1 * 100, - "morph": ud_scores["Feats"].f1 * 100, - } - header = ["Epoch", "P.Loss", "M.Loss", "LAS", "UAS", "TAG", "MORPH", "SENT", "WORD"] - if itn == 0: - print("\t".join(header)) - tpl = "\t".join(( - "{:d}", - "{dep_loss:.1f}", - "{morph_loss:.1f}", - "{las:.1f}", - "{uas:.1f}", - "{tags:.1f}", - "{morph:.1f}", - "{sents:.1f}", - "{words:.1f}", - )) - print(tpl.format(itn, **fields)) - - -# def get_sent_conllu(sent, sent_id): -# lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)] - - -def get_token_conllu(token, i): - if token._.begins_fused: - n = 1 - while 
token.nbor(n)._.inside_fused: - n += 1 - id_ = "%d-%d" % (i, i + n) - lines = [id_, token.text, "_", "_", "_", "_", "_", "_", "_", "_"] - else: - lines = [] - if token.head.i == token.i: - head = 0 - else: - head = i + (token.head.i - token.i) + 1 - features = list(token.morph) - feat_str = [] - replacements = {"one": "1", "two": "2", "three": "3"} - for feat in features: - if "=" in feat: - feat_str.append(feat) - elif not feat.startswith("begin") and not feat.startswith("end"): - key, value = feat.split("_", 1) - value = replacements.get(value, value) - feat_str.append("%s=%s" % (key, value.title())) - if not feat_str: - feat_str = "_" - else: - feat_str = "|".join(feat_str) - fields = [str(i+1), token.text, token.lemma_, token.pos_, token.tag_, feat_str, - str(head), token.dep_.lower(), "_", "_"] - lines.append("\t".join(fields)) - return "\n".join(lines) - - -################## -# Initialization # -################## - - -def load_nlp(corpus, config, vectors=None): - lang = corpus.split("_")[0] - nlp = spacy.blank(lang) - if config.vectors: - if not vectors: - raise ValueError( - "config asks for vectors, but no vectors " - "directory set on command line (use -v)" - ) - if (Path(vectors) / corpus).exists(): - nlp.vocab.from_disk(Path(vectors) / corpus / "vocab") - nlp.meta["treebank"] = corpus - return nlp - - -def initialize_pipeline(nlp, examples, config, device): - nlp.add_pipe(nlp.create_pipe("tagger", config={"set_morphology": False})) - nlp.add_pipe(nlp.create_pipe("morphologizer")) - nlp.add_pipe(nlp.create_pipe("parser")) - if config.multitask_tag: - nlp.parser.add_multitask_objective("tag") - if config.multitask_sent: - nlp.parser.add_multitask_objective("sent_start") - for eg in examples: - for tag in eg.get_aligned("TAG", as_string=True): - if tag is not None: - nlp.tagger.add_label(tag) - if torch is not None and device != -1: - torch.set_default_tensor_type("torch.cuda.FloatTensor") - optimizer = nlp.begin_training( - lambda: examples, - device=device, - subword_features=config.subword_features, - conv_depth=config.conv_depth, - bilstm_depth=config.bilstm_depth, - ) - if config.pretrained_tok2vec: - _load_pretrained_tok2vec(nlp, config.pretrained_tok2vec) - return optimizer - - -def _load_pretrained_tok2vec(nlp, loc): - """Load pretrained weights for the 'token-to-vector' part of the component - models, which is typically a CNN. See 'spacy pretrain'. Experimental. 
- """ - with Path(loc).open("rb", encoding="utf8") as file_: - weights_data = file_.read() - loaded = [] - for name, component in nlp.pipeline: - if hasattr(component, "model") and component.model.has_ref("tok2vec"): - component.get_ref("tok2vec").from_bytes(weights_data) - loaded.append(name) - return loaded - - -######################## -# Command line helpers # -######################## - - -class Config(object): - def __init__( - self, - vectors=None, - max_doc_length=10, - multitask_tag=False, - multitask_sent=False, - multitask_dep=False, - multitask_vectors=None, - bilstm_depth=0, - nr_epoch=30, - min_batch_size=100, - max_batch_size=1000, - batch_by_words=True, - dropout=0.2, - conv_depth=4, - subword_features=True, - vectors_dir=None, - pretrained_tok2vec=None, - ): - if vectors_dir is not None: - if vectors is None: - vectors = True - if multitask_vectors is None: - multitask_vectors = True - for key, value in locals().items(): - setattr(self, key, value) - - @classmethod - def load(cls, loc, vectors_dir=None): - with Path(loc).open("r", encoding="utf8") as file_: - cfg = json.load(file_) - if vectors_dir is not None: - cfg["vectors_dir"] = vectors_dir - return cls(**cfg) - - -class Dataset(object): - def __init__(self, path, section): - self.path = path - self.section = section - self.conllu = None - self.text = None - for file_path in self.path.iterdir(): - name = file_path.parts[-1] - if section in name and name.endswith("conllu"): - self.conllu = file_path - elif section in name and name.endswith("txt"): - self.text = file_path - if self.conllu is None: - msg = "Could not find .txt file in {path} for {section}" - raise IOError(msg.format(section=section, path=path)) - if self.text is None: - msg = "Could not find .txt file in {path} for {section}" - self.lang = self.conllu.parts[-1].split("-")[0].split("_")[0] - - -class TreebankPaths(object): - def __init__(self, ud_path, treebank, **cfg): - self.train = Dataset(ud_path / treebank, "train") - self.dev = Dataset(ud_path / treebank, "dev") - self.lang = self.train.lang - - -@plac.annotations( - ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path), - parses_dir=("Directory to write the development parses", "positional", None, Path), - corpus=( - "UD corpus to train and evaluate on, e.g. UD_Spanish-AnCora", - "positional", - None, - str, - ), - config=("Path to json formatted config file", "option", "C", Path), - limit=("Size limit", "option", "n", int), - gpu_device=("Use GPU", "option", "g", int), - use_oracle_segments=("Use oracle segments", "flag", "G", int), - vectors_dir=( - "Path to directory with pretrained vectors, named e.g. 
en/", - "option", - "v", - Path, - ), -) -def main( - ud_dir, - parses_dir, - corpus, - config=None, - limit=0, - gpu_device=-1, - vectors_dir=None, - use_oracle_segments=False, -): - Token.set_extension("get_conllu_lines", method=get_token_conllu) - Token.set_extension("begins_fused", default=False) - Token.set_extension("inside_fused", default=False) - - spacy.util.fix_random_seed() - lang.zh.Chinese.Defaults.use_jieba = False - lang.ja.Japanese.Defaults.use_janome = False - - if config is not None: - config = Config.load(config, vectors_dir=vectors_dir) - else: - config = Config(vectors_dir=vectors_dir) - paths = TreebankPaths(ud_dir, corpus) - if not (parses_dir / corpus).exists(): - (parses_dir / corpus).mkdir() - print("Train and evaluate", corpus, "using lang", paths.lang) - nlp = load_nlp(paths.lang, config, vectors=vectors_dir) - - examples = read_data( - nlp, - paths.train.conllu.open(encoding="utf8"), - paths.train.text.open(encoding="utf8"), - max_doc_length=config.max_doc_length, - limit=limit, - ) - - optimizer = initialize_pipeline(nlp, examples, config, gpu_device) - - batch_sizes = compounding(config.min_batch_size, config.max_batch_size, 1.001) - beam_prob = compounding(0.2, 0.8, 1.001) - for i in range(config.nr_epoch): - examples = read_data( - nlp, - paths.train.conllu.open(encoding="utf8"), - paths.train.text.open(encoding="utf8"), - max_doc_length=config.max_doc_length, - limit=limit, - oracle_segments=use_oracle_segments, - raw_text=not use_oracle_segments, - ) - random.shuffle(examples) - if config.batch_by_words: - batches = minibatch_by_words(examples, size=batch_sizes) - else: - batches = minibatch(examples, size=batch_sizes) - losses = {} - n_train_words = sum(len(eg.predicted) for eg in examples) - with tqdm.tqdm(total=n_train_words, leave=False) as pbar: - for batch in batches: - pbar.update(sum(len(ex.predicted) for ex in batch)) - nlp.parser.cfg["beam_update_prob"] = next(beam_prob) - nlp.update( - batch, - sgd=optimizer, - drop=config.dropout, - losses=losses, - ) - - out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i) - with nlp.use_params(optimizer.averages): - if use_oracle_segments: - parsed_docs, scores = evaluate(nlp, paths.dev.conllu, - paths.dev.conllu, out_path) - else: - parsed_docs, scores = evaluate(nlp, paths.dev.text, - paths.dev.conllu, out_path) - print_progress(i, losses, scores) - - -def _render_parses(i, to_render): - to_render[0].user_data["title"] = "Batch %d" % i - with Path("/tmp/parses.html").open("w", encoding="utf8") as file_: - html = displacy.render(to_render[:5], style="dep", page=True) - file_.write(html) - - -if __name__ == "__main__": - plac.call(main) diff --git a/examples/README.md b/examples/README.md deleted file mode 100644 index 869077531..000000000 --- a/examples/README.md +++ /dev/null @@ -1,19 +0,0 @@ - - -# spaCy examples - -The examples are Python scripts with well-behaved command line interfaces. For -more detailed usage guides, see the [documentation](https://spacy.io/usage/). - -To see the available arguments, you can use the `--help` or `-h` flag: - -```bash -$ python examples/training/train_ner.py --help -``` - -While we try to keep the examples up to date, they are not currently exercised -by the test suite, as some of them require significant data downloads or take -time to train. If you find that an example is no longer running, -[please tell us](https://github.com/explosion/spaCy/issues)! 
We know there's -nothing worse than trying to figure out what you're doing wrong, and it turns -out your code was never the problem. diff --git a/examples/deep_learning_keras.py b/examples/deep_learning_keras.py deleted file mode 100644 index bf857b8b7..000000000 --- a/examples/deep_learning_keras.py +++ /dev/null @@ -1,266 +0,0 @@ -""" -This example shows how to use an LSTM sentiment classification model trained -using Keras in spaCy. spaCy splits the document into sentences, and each -sentence is classified using the LSTM. The scores for the sentences are then -aggregated to give the document score. This kind of hierarchical model is quite -difficult in "pure" Keras or Tensorflow, but it's very effective. The Keras -example on this dataset performs quite poorly, because it cuts off the documents -so that they're a fixed size. This hurts review accuracy a lot, because people -often summarise their rating in the final sentence - -Prerequisites: -spacy download en_vectors_web_lg -pip install keras==2.0.9 - -Compatible with: spaCy v2.0.0+ -""" -import ml_datasets -import plac -import random -import pathlib -import cytoolz -import numpy -from keras.models import Sequential, model_from_json -from keras.layers import LSTM, Dense, Embedding, Bidirectional -from keras.layers import TimeDistributed -from keras.optimizers import Adam -from spacy.compat import pickle -import spacy - - -class SentimentAnalyser(object): - @classmethod - def load(cls, path, nlp, max_length=100): - with (path / "config.json").open() as file_: - model = model_from_json(file_.read()) - with (path / "model").open("rb") as file_: - lstm_weights = pickle.load(file_) - embeddings = get_embeddings(nlp.vocab) - model.set_weights([embeddings] + lstm_weights) - return cls(model, max_length=max_length) - - def __init__(self, model, max_length=100): - self._model = model - self.max_length = max_length - - def __call__(self, doc): - X = get_features([doc], self.max_length) - y = self._model.predict(X) - self.set_sentiment(doc, y) - - def pipe(self, docs, batch_size=1000): - for minibatch in cytoolz.partition_all(batch_size, docs): - minibatch = list(minibatch) - sentences = [] - for doc in minibatch: - sentences.extend(doc.sents) - Xs = get_features(sentences, self.max_length) - ys = self._model.predict(Xs) - for sent, label in zip(sentences, ys): - sent.doc.sentiment += label - 0.5 - for doc in minibatch: - yield doc - - def set_sentiment(self, doc, y): - doc.sentiment = float(y[0]) - # Sentiment has a native slot for a single float. 
- # For arbitrary data storage, there's: - # doc.user_data['my_data'] = y - - -def get_labelled_sentences(docs, doc_labels): - labels = [] - sentences = [] - for doc, y in zip(docs, doc_labels): - for sent in doc.sents: - sentences.append(sent) - labels.append(y) - return sentences, numpy.asarray(labels, dtype="int32") - - -def get_features(docs, max_length): - docs = list(docs) - Xs = numpy.zeros((len(docs), max_length), dtype="int32") - for i, doc in enumerate(docs): - j = 0 - for token in doc: - vector_id = token.vocab.vectors.find(key=token.orth) - if vector_id >= 0: - Xs[i, j] = vector_id - else: - Xs[i, j] = 0 - j += 1 - if j >= max_length: - break - return Xs - - -def train( - train_texts, - train_labels, - dev_texts, - dev_labels, - lstm_shape, - lstm_settings, - lstm_optimizer, - batch_size=100, - nb_epoch=5, - by_sentence=True, -): - - print("Loading spaCy") - nlp = spacy.load("en_vectors_web_lg") - nlp.add_pipe(nlp.create_pipe("sentencizer")) - embeddings = get_embeddings(nlp.vocab) - model = compile_lstm(embeddings, lstm_shape, lstm_settings) - - print("Parsing texts...") - train_docs = list(nlp.pipe(train_texts)) - dev_docs = list(nlp.pipe(dev_texts)) - if by_sentence: - train_docs, train_labels = get_labelled_sentences(train_docs, train_labels) - dev_docs, dev_labels = get_labelled_sentences(dev_docs, dev_labels) - - train_X = get_features(train_docs, lstm_shape["max_length"]) - dev_X = get_features(dev_docs, lstm_shape["max_length"]) - model.fit( - train_X, - train_labels, - validation_data=(dev_X, dev_labels), - epochs=nb_epoch, - batch_size=batch_size, - ) - return model - - -def compile_lstm(embeddings, shape, settings): - model = Sequential() - model.add( - Embedding( - embeddings.shape[0], - embeddings.shape[1], - input_length=shape["max_length"], - trainable=False, - weights=[embeddings], - mask_zero=True, - ) - ) - model.add(TimeDistributed(Dense(shape["nr_hidden"], use_bias=False))) - model.add( - Bidirectional( - LSTM( - shape["nr_hidden"], - recurrent_dropout=settings["dropout"], - dropout=settings["dropout"], - ) - ) - ) - model.add(Dense(shape["nr_class"], activation="sigmoid")) - model.compile( - optimizer=Adam(lr=settings["lr"]), - loss="binary_crossentropy", - metrics=["accuracy"], - ) - return model - - -def get_embeddings(vocab): - return vocab.vectors.data - - -def evaluate(model_dir, texts, labels, max_length=100): - nlp = spacy.load("en_vectors_web_lg") - nlp.add_pipe(nlp.create_pipe("sentencizer")) - nlp.add_pipe(SentimentAnalyser.load(model_dir, nlp, max_length=max_length)) - - correct = 0 - i = 0 - for doc in nlp.pipe(texts, batch_size=1000): - correct += bool(doc.sentiment >= 0.5) == bool(labels[i]) - i += 1 - return float(correct) / i - - -def read_data(data_dir, limit=0): - examples = [] - for subdir, label in (("pos", 1), ("neg", 0)): - for filename in (data_dir / subdir).iterdir(): - with filename.open() as file_: - text = file_.read() - examples.append((text, label)) - random.shuffle(examples) - if limit >= 1: - examples = examples[:limit] - return zip(*examples) # Unzips into two lists - - -@plac.annotations( - train_dir=("Location of training file or directory"), - dev_dir=("Location of development file or directory"), - model_dir=("Location of output model directory",), - is_runtime=("Demonstrate run-time usage", "flag", "r", bool), - nr_hidden=("Number of hidden units", "option", "H", int), - max_length=("Maximum sentence length", "option", "L", int), - dropout=("Dropout", "option", "d", float), - learn_rate=("Learn rate", "option", "e", 
float), - nb_epoch=("Number of training epochs", "option", "i", int), - batch_size=("Size of minibatches for training LSTM", "option", "b", int), - nr_examples=("Limit to N examples", "option", "n", int), -) -def main( - model_dir=None, - train_dir=None, - dev_dir=None, - is_runtime=False, - nr_hidden=64, - max_length=100, # Shape - dropout=0.5, - learn_rate=0.001, # General NN config - nb_epoch=5, - batch_size=256, - nr_examples=-1, -): # Training params - if model_dir is not None: - model_dir = pathlib.Path(model_dir) - if train_dir is None or dev_dir is None: - imdb_data = ml_datasets.imdb() - if is_runtime: - if dev_dir is None: - dev_texts, dev_labels = zip(*imdb_data[1]) - else: - dev_texts, dev_labels = read_data(dev_dir) - acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length) - print(acc) - else: - if train_dir is None: - train_texts, train_labels = zip(*imdb_data[0]) - else: - print("Read data") - train_texts, train_labels = read_data(train_dir, limit=nr_examples) - if dev_dir is None: - dev_texts, dev_labels = zip(*imdb_data[1]) - else: - dev_texts, dev_labels = read_data(dev_dir, imdb_data, limit=nr_examples) - train_labels = numpy.asarray(train_labels, dtype="int32") - dev_labels = numpy.asarray(dev_labels, dtype="int32") - lstm = train( - train_texts, - train_labels, - dev_texts, - dev_labels, - {"nr_hidden": nr_hidden, "max_length": max_length, "nr_class": 1}, - {"dropout": dropout, "lr": learn_rate}, - {}, - nb_epoch=nb_epoch, - batch_size=batch_size, - ) - weights = lstm.get_weights() - if model_dir is not None: - with (model_dir / "model").open("wb") as file_: - pickle.dump(weights[1:], file_) - with (model_dir / "config.json").open("w") as file_: - file_.write(lstm.to_json()) - - -if __name__ == "__main__": - plac.call(main) diff --git a/examples/information_extraction/entity_relations.py b/examples/information_extraction/entity_relations.py deleted file mode 100644 index c40a3c10d..000000000 --- a/examples/information_extraction/entity_relations.py +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 -"""A simple example of extracting relations between phrases and entities using -spaCy's named entity recognizer and the dependency parse. Here, we extract -money and currency values (entities labelled as MONEY) and then check the -dependency tree to find the noun phrase they are referring to – for example: -$9.4 million --> Net income. 
- -Compatible with: spaCy v2.0.0+ -Last tested with: v2.2.1 -""" -from __future__ import unicode_literals, print_function - -import plac -import spacy - - -TEXTS = [ - "Net income was $9.4 million compared to the prior year of $2.7 million.", - "Revenue exceeded twelve billion dollars, with a loss of $1b.", -] - - -@plac.annotations( - model=("Model to load (needs parser and NER)", "positional", None, str) -) -def main(model="en_core_web_sm"): - nlp = spacy.load(model) - print("Loaded model '%s'" % model) - print("Processing %d texts" % len(TEXTS)) - - for text in TEXTS: - doc = nlp(text) - relations = extract_currency_relations(doc) - for r1, r2 in relations: - print("{:<10}\t{}\t{}".format(r1.text, r2.ent_type_, r2.text)) - - -def filter_spans(spans): - # Filter a sequence of spans so they don't contain overlaps - # For spaCy 2.1.4+: this function is available as spacy.util.filter_spans() - get_sort_key = lambda span: (span.end - span.start, -span.start) - sorted_spans = sorted(spans, key=get_sort_key, reverse=True) - result = [] - seen_tokens = set() - for span in sorted_spans: - # Check for end - 1 here because boundaries are inclusive - if span.start not in seen_tokens and span.end - 1 not in seen_tokens: - result.append(span) - seen_tokens.update(range(span.start, span.end)) - result = sorted(result, key=lambda span: span.start) - return result - - -def extract_currency_relations(doc): - # Merge entities and noun chunks into one token - spans = list(doc.ents) + list(doc.noun_chunks) - spans = filter_spans(spans) - with doc.retokenize() as retokenizer: - for span in spans: - retokenizer.merge(span) - - relations = [] - for money in filter(lambda w: w.ent_type_ == "MONEY", doc): - if money.dep_ in ("attr", "dobj"): - subject = [w for w in money.head.lefts if w.dep_ == "nsubj"] - if subject: - subject = subject[0] - relations.append((subject, money)) - elif money.dep_ == "pobj" and money.head.dep_ == "prep": - relations.append((money.head.head, money)) - return relations - - -if __name__ == "__main__": - plac.call(main) - - # Expected output: - # Net income MONEY $9.4 million - # the prior year MONEY $2.7 million - # Revenue MONEY twelve billion dollars - # a loss MONEY 1b diff --git a/examples/information_extraction/parse_subtrees.py b/examples/information_extraction/parse_subtrees.py deleted file mode 100644 index 2ca9da1ea..000000000 --- a/examples/information_extraction/parse_subtrees.py +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 -"""This example shows how to navigate the parse tree including subtrees -attached to a word. - -Based on issue #252: -"In the documents and tutorials the main thing I haven't found is -examples on how to break sentences down into small sub thoughts/chunks. The -noun_chunks is handy, but having examples on using the token.head to find small -(near-complete) sentence chunks would be neat. Lets take the example sentence: -"displaCy uses CSS and JavaScript to show you how computers understand language" - -This sentence has two main parts (XCOMP & CCOMP) according to the breakdown: -[displaCy] uses CSS and Javascript [to + show] -show you how computers understand [language] - -I'm assuming that we can use the token.head to build these groups." 
- -Compatible with: spaCy v2.0.0+ -Last tested with: v2.1.0 -""" -from __future__ import unicode_literals, print_function - -import plac -import spacy - - -@plac.annotations(model=("Model to load", "positional", None, str)) -def main(model="en_core_web_sm"): - nlp = spacy.load(model) - print("Loaded model '%s'" % model) - - doc = nlp( - "displaCy uses CSS and JavaScript to show you how computers " - "understand language" - ) - - # The easiest way is to find the head of the subtree you want, and then use - # the `.subtree`, `.children`, `.lefts` and `.rights` iterators. `.subtree` - # is the one that does what you're asking for most directly: - for word in doc: - if word.dep_ in ("xcomp", "ccomp"): - print("".join(w.text_with_ws for w in word.subtree)) - - # It'd probably be better for `word.subtree` to return a `Span` object - # instead of a generator over the tokens. If you want the `Span` you can - # get it via the `.right_edge` and `.left_edge` properties. The `Span` - # object is nice because you can easily get a vector, merge it, etc. - for word in doc: - if word.dep_ in ("xcomp", "ccomp"): - subtree_span = doc[word.left_edge.i : word.right_edge.i + 1] - print(subtree_span.text, "|", subtree_span.root.text) - - # You might also want to select a head, and then select a start and end - # position by walking along its children. You could then take the - # `.left_edge` and `.right_edge` of those tokens, and use it to calculate - # a span. - - -if __name__ == "__main__": - plac.call(main) - - # Expected output: - # to show you how computers understand language - # how computers understand language - # to show you how computers understand language | show - # how computers understand language | understand diff --git a/examples/information_extraction/phrase_matcher.py b/examples/information_extraction/phrase_matcher.py deleted file mode 100644 index f3622bfdd..000000000 --- a/examples/information_extraction/phrase_matcher.py +++ /dev/null @@ -1,112 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 -"""Match a large set of multi-word expressions in O(1) time. - -The idea is to associate each word in the vocabulary with a tag, noting whether -they begin, end, or are inside at least one pattern. An additional tag is used -for single-word patterns. Complete patterns are also stored in a hash set. -When we process a document, we look up the words in the vocabulary, to -associate the words with the tags. We then search for tag-sequences that -correspond to valid candidates. Finally, we look up the candidates in the hash -set. - -For instance, to search for the phrases "Barack Hussein Obama" and "Hilary -Clinton", we would associate "Barack" and "Hilary" with the B tag, Hussein with -the I tag, and Obama and Clinton with the L tag. - -The document "Barack Clinton and Hilary Clinton" would have the tag sequence -[{B}, {L}, {}, {B}, {L}], so we'd get two matches. However, only the second -candidate is in the phrase dictionary, so only one is returned as a match. - -The algorithm is O(n) at run-time for document of length n because we're only -ever matching over the tag patterns. So no matter how many phrases we're -looking for, our pattern set stays very small (exact size depends on the -maximum length we're looking for, as the query language currently has no -quantifiers). 
- -The example expects a .bz2 file from the Reddit corpus, and a patterns file, -formatted in jsonl as a sequence of entries like this: - -{"text":"Anchorage"} -{"text":"Angola"} -{"text":"Ann Arbor"} -{"text":"Annapolis"} -{"text":"Appalachia"} -{"text":"Argentina"} - -Reddit comments corpus: -* https://files.pushshift.io/reddit/ -* https://archive.org/details/2015_reddit_comments_corpus - -Compatible with: spaCy v2.0.0+ -""" -from __future__ import print_function, unicode_literals, division - -from bz2 import BZ2File -import time -import plac -import json - -from spacy.matcher import PhraseMatcher -import spacy - - -@plac.annotations( - patterns_loc=("Path to gazetteer", "positional", None, str), - text_loc=("Path to Reddit corpus file", "positional", None, str), - n=("Number of texts to read", "option", "n", int), - lang=("Language class to initialise", "option", "l", str), -) -def main(patterns_loc, text_loc, n=10000, lang="en"): - nlp = spacy.blank(lang) - nlp.vocab.lex_attr_getters = {} - phrases = read_gazetteer(nlp.tokenizer, patterns_loc) - count = 0 - t1 = time.time() - for ent_id, text in get_matches(nlp.tokenizer, phrases, read_text(text_loc, n=n)): - count += 1 - t2 = time.time() - print("%d docs in %.3f s. %d matches" % (n, (t2 - t1), count)) - - -def read_gazetteer(tokenizer, loc, n=-1): - for i, line in enumerate(open(loc)): - data = json.loads(line.strip()) - phrase = tokenizer(data["text"]) - for w in phrase: - _ = tokenizer.vocab[w.text] - if len(phrase) >= 2: - yield phrase - - -def read_text(bz2_loc, n=10000): - with BZ2File(bz2_loc) as file_: - for i, line in enumerate(file_): - data = json.loads(line) - yield data["body"] - if i >= n: - break - - -def get_matches(tokenizer, phrases, texts): - matcher = PhraseMatcher(tokenizer.vocab) - matcher.add("Phrase", None, *phrases) - for text in texts: - doc = tokenizer(text) - for w in doc: - _ = doc.vocab[w.text] - matches = matcher(doc) - for ent_id, start, end in matches: - yield (ent_id, doc[start:end].text) - - -if __name__ == "__main__": - if False: - import cProfile - import pstats - - cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof") - s = pstats.Stats("Profile.prof") - s.strip_dirs().sort_stats("time").print_stats() - else: - plac.call(main) diff --git a/examples/keras_parikh_entailment/README.md b/examples/keras_parikh_entailment/README.md deleted file mode 100644 index 86ba50d9b..000000000 --- a/examples/keras_parikh_entailment/README.md +++ /dev/null @@ -1,114 +0,0 @@ - - -# A decomposable attention model for Natural Language Inference -**by Matthew Honnibal, [@honnibal](https://github.com/honnibal)** -**Updated for spaCy 2.0+ and Keras 2.2.2+ by John Stewart, [@free-variation](https://github.com/free-variation)** - -This directory contains an implementation of the entailment prediction model described -by [Parikh et al. (2016)](https://arxiv.org/pdf/1606.01933.pdf). The model is notable -for its competitive performance with very few parameters. - -The model is implemented using [Keras](https://keras.io/) and [spaCy](https://spacy.io). -Keras is used to build and train the network. spaCy is used to load -the [GloVe](http://nlp.stanford.edu/projects/glove/) vectors, perform the -feature extraction, and help you apply the model at run-time. 
The following -demo code shows how the entailment model can be used at runtime, once the -hook is installed to customise the `.similarity()` method of spaCy's `Doc` -and `Span` objects: - -```python -def demo(shape): - nlp = spacy.load('en_vectors_web_lg') - nlp.add_pipe(KerasSimilarityShim.load(nlp.path / 'similarity', nlp, shape[0])) - - doc1 = nlp(u'The king of France is bald.') - doc2 = nlp(u'France has no king.') - - print("Sentence 1:", doc1) - print("Sentence 2:", doc2) - - entailment_type, confidence = doc1.similarity(doc2) - print("Entailment type:", entailment_type, "(Confidence:", confidence, ")") -``` - -Which gives the output `Entailment type: contradiction (Confidence: 0.60604566)`, showing that -the system has definite opinions about Betrand Russell's [famous conundrum](https://users.drew.edu/jlenz/br-on-denoting.html)! - -I'm working on a blog post to explain Parikh et al.'s model in more detail. -A [notebook](https://github.com/free-variation/spaCy/blob/master/examples/notebooks/Decompositional%20Attention.ipynb) is available that briefly explains this implementation. -I think it is a very interesting example of the attention mechanism, which -I didn't understand very well before working through this paper. There are -lots of ways to extend the model. - -## What's where - -| File | Description | -| --- | --- | -| `__main__.py` | The script that will be executed. Defines the CLI, the data reading, etc — all the boring stuff. | -| `spacy_hook.py` | Provides a class `KerasSimilarityShim` that lets you use an arbitrary function to customize spaCy's `doc.similarity()` method. Instead of the default average-of-vectors algorithm, when you call `doc1.similarity(doc2)`, you'll get the result of `your_model(doc1, doc2)`. | -| `keras_decomposable_attention.py` | Defines the neural network model. | - -## Setting up - -First, install [Keras](https://keras.io/), [spaCy](https://spacy.io) and the spaCy -English models (about 1GB of data): - -```bash -pip install keras -pip install spacy -python -m spacy download en_vectors_web_lg -``` - -You'll also want to get Keras working on your GPU, and you will need a backend, such as TensorFlow or Theano. -This will depend on your set up, so you're mostly on your own for this step. If you're using AWS, try the -[NVidia AMI](https://aws.amazon.com/marketplace/pp/B00FYCDDTE). It made things pretty easy. - -Once you've installed the dependencies, you can run a small preliminary test of -the Keras model: - -```bash -py.test keras_parikh_entailment/keras_decomposable_attention.py -``` - -This compiles the model and fits it with some dummy data. You should see that -both tests passed. - -Finally, download the [Stanford Natural Language Inference corpus](http://nlp.stanford.edu/projects/snli/). - -## Running the example - -You can run the `keras_parikh_entailment/` directory as a script, which executes the file -[`keras_parikh_entailment/__main__.py`](__main__.py). If you run the script without arguments -the usage is shown. Running it with `-h` explains the command line arguments. - -The first thing you'll want to do is train the model: - -```bash -python keras_parikh_entailment/ train -t -s -``` - -Training takes about 300 epochs for full accuracy, and I haven't rerun the full -experiment since refactoring things to publish this example — please let me -know if I've broken something. You should get to at least 85% on the development data even after 10-15 epochs. - -The other two modes demonstrate run-time usage. 
I never like relying on the accuracy printed -by `.fit()` methods. I never really feel confident until I've run a new process that loads -the model and starts making predictions, without access to the gold labels. I've therefore -included an `evaluate` mode. - -```bash -python keras_parikh_entailment/ evaluate -s -``` - -Finally, there's also a little demo, which mostly exists to show -you how run-time usage will eventually look. - -```bash -python keras_parikh_entailment/ demo -``` - -## Getting updates - -We should have the blog post explaining the model ready before the end of the week. To get -notified when it's published, you can either follow me on [Twitter](https://twitter.com/honnibal) -or subscribe to our [mailing list](http://eepurl.com/ckUpQ5). diff --git a/examples/keras_parikh_entailment/__main__.py b/examples/keras_parikh_entailment/__main__.py deleted file mode 100644 index ad398dae3..000000000 --- a/examples/keras_parikh_entailment/__main__.py +++ /dev/null @@ -1,207 +0,0 @@ -import numpy as np -import json -from keras.utils import to_categorical -import plac -import sys - -from keras_decomposable_attention import build_model -from spacy_hook import get_embeddings, KerasSimilarityShim - -try: - import cPickle as pickle -except ImportError: - import pickle - -import spacy - -# workaround for keras/tensorflow bug -# see https://github.com/tensorflow/tensorflow/issues/3388 -import os -import importlib -from keras import backend as K - - -def set_keras_backend(backend): - if K.backend() != backend: - os.environ["KERAS_BACKEND"] = backend - importlib.reload(K) - assert K.backend() == backend - if backend == "tensorflow": - K.get_session().close() - cfg = K.tf.ConfigProto() - cfg.gpu_options.allow_growth = True - K.set_session(K.tf.Session(config=cfg)) - K.clear_session() - - -set_keras_backend("tensorflow") - - -def train(train_loc, dev_loc, shape, settings): - train_texts1, train_texts2, train_labels = read_snli(train_loc) - dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc) - - print("Loading spaCy") - nlp = spacy.load("en_vectors_web_lg") - assert nlp.path is not None - print("Processing texts...") - train_X = create_dataset(nlp, train_texts1, train_texts2, 100, shape[0]) - dev_X = create_dataset(nlp, dev_texts1, dev_texts2, 100, shape[0]) - - print("Compiling network") - model = build_model(get_embeddings(nlp.vocab), shape, settings) - - print(settings) - model.fit( - train_X, - train_labels, - validation_data=(dev_X, dev_labels), - epochs=settings["nr_epoch"], - batch_size=settings["batch_size"], - ) - if not (nlp.path / "similarity").exists(): - (nlp.path / "similarity").mkdir() - print("Saving to", nlp.path / "similarity") - weights = model.get_weights() - # remove the embedding matrix. We can reconstruct it. 
- del weights[1] - with (nlp.path / "similarity" / "model").open("wb") as file_: - pickle.dump(weights, file_) - with (nlp.path / "similarity" / "config.json").open("w") as file_: - file_.write(model.to_json()) - - -def evaluate(dev_loc, shape): - dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc) - nlp = spacy.load("en_vectors_web_lg") - nlp.add_pipe(KerasSimilarityShim.load(nlp.path / "similarity", nlp, shape[0])) - total = 0.0 - correct = 0.0 - for text1, text2, label in zip(dev_texts1, dev_texts2, dev_labels): - doc1 = nlp(text1) - doc2 = nlp(text2) - sim, _ = doc1.similarity(doc2) - if sim == KerasSimilarityShim.entailment_types[label.argmax()]: - correct += 1 - total += 1 - return correct, total - - -def demo(shape): - nlp = spacy.load("en_vectors_web_lg") - nlp.add_pipe(KerasSimilarityShim.load(nlp.path / "similarity", nlp, shape[0])) - - doc1 = nlp("The king of France is bald.") - doc2 = nlp("France has no king.") - - print("Sentence 1:", doc1) - print("Sentence 2:", doc2) - - entailment_type, confidence = doc1.similarity(doc2) - print("Entailment type:", entailment_type, "(Confidence:", confidence, ")") - - -LABELS = {"entailment": 0, "contradiction": 1, "neutral": 2} - - -def read_snli(path): - texts1 = [] - texts2 = [] - labels = [] - with open(path, "r") as file_: - for line in file_: - eg = json.loads(line) - label = eg["gold_label"] - if label == "-": # per Parikh, ignore - SNLI entries - continue - texts1.append(eg["sentence1"]) - texts2.append(eg["sentence2"]) - labels.append(LABELS[label]) - return texts1, texts2, to_categorical(np.asarray(labels, dtype="int32")) - - -def create_dataset(nlp, texts, hypotheses, num_unk, max_length): - sents = texts + hypotheses - sents_as_ids = [] - for sent in sents: - doc = nlp(sent) - word_ids = [] - for i, token in enumerate(doc): - # skip odd spaces from tokenizer - if token.has_vector and token.vector_norm == 0: - continue - - if i > max_length: - break - - if token.has_vector: - word_ids.append(token.rank + num_unk + 1) - else: - # if we don't have a vector, pick an OOV entry - word_ids.append(token.rank % num_unk + 1) - - # there must be a simpler way of generating padded arrays from lists... 
- word_id_vec = np.zeros((max_length), dtype="int") - clipped_len = min(max_length, len(word_ids)) - word_id_vec[:clipped_len] = word_ids[:clipped_len] - sents_as_ids.append(word_id_vec) - - return [np.array(sents_as_ids[: len(texts)]), np.array(sents_as_ids[len(texts) :])] - - -@plac.annotations( - mode=("Mode to execute", "positional", None, str, ["train", "evaluate", "demo"]), - train_loc=("Path to training data", "option", "t", str), - dev_loc=("Path to development or test data", "option", "s", str), - max_length=("Length to truncate sentences", "option", "L", int), - nr_hidden=("Number of hidden units", "option", "H", int), - dropout=("Dropout level", "option", "d", float), - learn_rate=("Learning rate", "option", "r", float), - batch_size=("Batch size for neural network training", "option", "b", int), - nr_epoch=("Number of training epochs", "option", "e", int), - entail_dir=( - "Direction of entailment", - "option", - "D", - str, - ["both", "left", "right"], - ), -) -def main( - mode, - train_loc, - dev_loc, - max_length=50, - nr_hidden=200, - dropout=0.2, - learn_rate=0.001, - batch_size=1024, - nr_epoch=10, - entail_dir="both", -): - shape = (max_length, nr_hidden, 3) - settings = { - "lr": learn_rate, - "dropout": dropout, - "batch_size": batch_size, - "nr_epoch": nr_epoch, - "entail_dir": entail_dir, - } - - if mode == "train": - if train_loc == None or dev_loc == None: - print("Train mode requires paths to training and development data sets.") - sys.exit(1) - train(train_loc, dev_loc, shape, settings) - elif mode == "evaluate": - if dev_loc == None: - print("Evaluate mode requires paths to test data set.") - sys.exit(1) - correct, total = evaluate(dev_loc, shape) - print(correct, "/", total, correct / total) - else: - demo(shape) - - -if __name__ == "__main__": - plac.call(main) diff --git a/examples/keras_parikh_entailment/keras_decomposable_attention.py b/examples/keras_parikh_entailment/keras_decomposable_attention.py deleted file mode 100644 index 2e17a11ee..000000000 --- a/examples/keras_parikh_entailment/keras_decomposable_attention.py +++ /dev/null @@ -1,152 +0,0 @@ -# Semantic entailment/similarity with decomposable attention (using spaCy and Keras) -# Practical state-of-the-art textual entailment with spaCy and Keras - -import numpy as np -from keras import layers, Model, models, optimizers -from keras import backend as K - - -def build_model(vectors, shape, settings): - max_length, nr_hidden, nr_class = shape - - input1 = layers.Input(shape=(max_length,), dtype="int32", name="words1") - input2 = layers.Input(shape=(max_length,), dtype="int32", name="words2") - - # embeddings (projected) - embed = create_embedding(vectors, max_length, nr_hidden) - - a = embed(input1) - b = embed(input2) - - # step 1: attend - F = create_feedforward(nr_hidden) - att_weights = layers.dot([F(a), F(b)], axes=-1) - - G = create_feedforward(nr_hidden) - - if settings["entail_dir"] == "both": - norm_weights_a = layers.Lambda(normalizer(1))(att_weights) - norm_weights_b = layers.Lambda(normalizer(2))(att_weights) - alpha = layers.dot([norm_weights_a, a], axes=1) - beta = layers.dot([norm_weights_b, b], axes=1) - - # step 2: compare - comp1 = layers.concatenate([a, beta]) - comp2 = layers.concatenate([b, alpha]) - v1 = layers.TimeDistributed(G)(comp1) - v2 = layers.TimeDistributed(G)(comp2) - - # step 3: aggregate - v1_sum = layers.Lambda(sum_word)(v1) - v2_sum = layers.Lambda(sum_word)(v2) - concat = layers.concatenate([v1_sum, v2_sum]) - - elif settings["entail_dir"] == "left": - 
norm_weights_a = layers.Lambda(normalizer(1))(att_weights) - alpha = layers.dot([norm_weights_a, a], axes=1) - comp2 = layers.concatenate([b, alpha]) - v2 = layers.TimeDistributed(G)(comp2) - v2_sum = layers.Lambda(sum_word)(v2) - concat = v2_sum - - else: - norm_weights_b = layers.Lambda(normalizer(2))(att_weights) - beta = layers.dot([norm_weights_b, b], axes=1) - comp1 = layers.concatenate([a, beta]) - v1 = layers.TimeDistributed(G)(comp1) - v1_sum = layers.Lambda(sum_word)(v1) - concat = v1_sum - - H = create_feedforward(nr_hidden) - out = H(concat) - out = layers.Dense(nr_class, activation="softmax")(out) - - model = Model([input1, input2], out) - - model.compile( - optimizer=optimizers.Adam(lr=settings["lr"]), - loss="categorical_crossentropy", - metrics=["accuracy"], - ) - - return model - - -def create_embedding(vectors, max_length, projected_dim): - return models.Sequential( - [ - layers.Embedding( - vectors.shape[0], - vectors.shape[1], - input_length=max_length, - weights=[vectors], - trainable=False, - ), - layers.TimeDistributed( - layers.Dense(projected_dim, activation=None, use_bias=False) - ), - ] - ) - - -def create_feedforward(num_units=200, activation="relu", dropout_rate=0.2): - return models.Sequential( - [ - layers.Dense(num_units, activation=activation), - layers.Dropout(dropout_rate), - layers.Dense(num_units, activation=activation), - layers.Dropout(dropout_rate), - ] - ) - - -def normalizer(axis): - def _normalize(att_weights): - exp_weights = K.exp(att_weights) - sum_weights = K.sum(exp_weights, axis=axis, keepdims=True) - return exp_weights / sum_weights - - return _normalize - - -def sum_word(x): - return K.sum(x, axis=1) - - -def test_build_model(): - vectors = np.ndarray((100, 8), dtype="float32") - shape = (10, 16, 3) - settings = {"lr": 0.001, "dropout": 0.2, "gru_encode": True, "entail_dir": "both"} - model = build_model(vectors, shape, settings) - - -def test_fit_model(): - def _generate_X(nr_example, length, nr_vector): - X1 = np.ndarray((nr_example, length), dtype="int32") - X1 *= X1 < nr_vector - X1 *= 0 <= X1 - X2 = np.ndarray((nr_example, length), dtype="int32") - X2 *= X2 < nr_vector - X2 *= 0 <= X2 - return [X1, X2] - - def _generate_Y(nr_example, nr_class): - ys = np.zeros((nr_example, nr_class), dtype="int32") - for i in range(nr_example): - ys[i, i % nr_class] = 1 - return ys - - vectors = np.ndarray((100, 8), dtype="float32") - shape = (10, 16, 3) - settings = {"lr": 0.001, "dropout": 0.2, "gru_encode": True, "entail_dir": "both"} - model = build_model(vectors, shape, settings) - - train_X = _generate_X(20, shape[0], vectors.shape[0]) - train_Y = _generate_Y(20, shape[2]) - dev_X = _generate_X(15, shape[0], vectors.shape[0]) - dev_Y = _generate_Y(15, shape[2]) - - model.fit(train_X, train_Y, validation_data=(dev_X, dev_Y), epochs=5, batch_size=4) - - -__all__ = [build_model] diff --git a/examples/keras_parikh_entailment/spacy_hook.py b/examples/keras_parikh_entailment/spacy_hook.py deleted file mode 100644 index 307669a70..000000000 --- a/examples/keras_parikh_entailment/spacy_hook.py +++ /dev/null @@ -1,77 +0,0 @@ -import numpy as np -from keras.models import model_from_json - -try: - import cPickle as pickle -except ImportError: - import pickle - - -class KerasSimilarityShim(object): - entailment_types = ["entailment", "contradiction", "neutral"] - - @classmethod - def load(cls, path, nlp, max_length=100, get_features=None): - - if get_features is None: - get_features = get_word_ids - - with (path / "config.json").open() as file_: - model = 
model_from_json(file_.read()) - with (path / "model").open("rb") as file_: - weights = pickle.load(file_) - - embeddings = get_embeddings(nlp.vocab) - weights.insert(1, embeddings) - model.set_weights(weights) - - return cls(model, get_features=get_features, max_length=max_length) - - def __init__(self, model, get_features=None, max_length=100): - self.model = model - self.get_features = get_features - self.max_length = max_length - - def __call__(self, doc): - doc.user_hooks["similarity"] = self.predict - doc.user_span_hooks["similarity"] = self.predict - - return doc - - def predict(self, doc1, doc2): - x1 = self.get_features([doc1], max_length=self.max_length) - x2 = self.get_features([doc2], max_length=self.max_length) - scores = self.model.predict([x1, x2]) - - return self.entailment_types[scores.argmax()], scores.max() - - -def get_embeddings(vocab, nr_unk=100): - # the extra +1 is for a zero vector representing sentence-final padding - num_vectors = max(lex.rank for lex in vocab) + 2 - - # create random vectors for OOV tokens - oov = np.random.normal(size=(nr_unk, vocab.vectors_length)) - oov = oov / oov.sum(axis=1, keepdims=True) - - vectors = np.zeros((num_vectors + nr_unk, vocab.vectors_length), dtype="float32") - vectors[1 : (nr_unk + 1),] = oov - for lex in vocab: - if lex.has_vector and lex.vector_norm > 0: - vectors[nr_unk + lex.rank + 1] = lex.vector / lex.vector_norm - - return vectors - - -def get_word_ids(docs, max_length=100, nr_unk=100): - Xs = np.zeros((len(docs), max_length), dtype="int32") - - for i, doc in enumerate(docs): - for j, token in enumerate(doc): - if j == max_length: - break - if token.has_vector: - Xs[i, j] = token.rank + nr_unk + 1 - else: - Xs[i, j] = token.rank % nr_unk + 1 - return Xs diff --git a/examples/load_from_docbin.py b/examples/load_from_docbin.py deleted file mode 100644 index f26e7fc49..000000000 --- a/examples/load_from_docbin.py +++ /dev/null @@ -1,45 +0,0 @@ -# coding: utf-8 -""" -Example of loading previously parsed text using spaCy's DocBin class. The example -performs an entity count to show that the annotations are available. 
-For more details, see https://spacy.io/usage/saving-loading#docs -Installation: -python -m spacy download en_core_web_lg -Usage: -python examples/load_from_docbin.py en_core_web_lg RC_2015-03-9.spacy -""" -from __future__ import unicode_literals - -import spacy -from spacy.tokens import DocBin -from timeit import default_timer as timer -from collections import Counter - -EXAMPLE_PARSES_PATH = "RC_2015-03-9.spacy" - - -def main(model="en_core_web_lg", docbin_path=EXAMPLE_PARSES_PATH): - nlp = spacy.load(model) - print("Reading data from {}".format(docbin_path)) - with open(docbin_path, "rb") as file_: - bytes_data = file_.read() - nr_word = 0 - start_time = timer() - entities = Counter() - docbin = DocBin().from_bytes(bytes_data) - for doc in docbin.get_docs(nlp.vocab): - nr_word += len(doc) - entities.update((e.label_, e.text) for e in doc.ents) - end_time = timer() - msg = "Loaded {nr_word} words in {seconds} seconds ({wps} words per second)" - wps = nr_word / (end_time - start_time) - print(msg.format(nr_word=nr_word, seconds=end_time - start_time, wps=wps)) - print("Most common entities:") - for (label, entity), freq in entities.most_common(30): - print(freq, entity, label) - - -if __name__ == "__main__": - import plac - - plac.call(main) diff --git a/examples/notebooks/Decompositional Attention.ipynb b/examples/notebooks/Decompositional Attention.ipynb deleted file mode 100644 index 8baaf7d33..000000000 --- a/examples/notebooks/Decompositional Attention.ipynb +++ /dev/null @@ -1,955 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Natural language inference using spaCy and Keras" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Introduction" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This notebook details an implementation of the natural language inference model presented in [(Parikh et al, 2016)](https://arxiv.org/abs/1606.01933). The model is notable for the small number of paramaters *and hyperparameters* it specifices, while still yielding good performance." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Constructing the dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import spacy\n", - "import numpy as np" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We only need the GloVe vectors from spaCy, not a full NLP pipeline." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "nlp = spacy.load('en_vectors_web_lg')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Function to load the SNLI dataset. The categories are converted to one-shot representation. The function comes from an example in spaCy." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/jds/tensorflow-gpu/lib/python3.5/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. 
In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", - " from ._conv import register_converters as _register_converters\n", - "Using TensorFlow backend.\n" - ] - } - ], - "source": [ - "import json\n", - "from keras.utils import to_categorical\n", - "\n", - "LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}\n", - "def read_snli(path):\n", - " texts1 = []\n", - " texts2 = []\n", - " labels = []\n", - " with open(path, 'r') as file_:\n", - " for line in file_:\n", - " eg = json.loads(line)\n", - " label = eg['gold_label']\n", - " if label == '-': # per Parikh, ignore - SNLI entries\n", - " continue\n", - " texts1.append(eg['sentence1'])\n", - " texts2.append(eg['sentence2'])\n", - " labels.append(LABELS[label])\n", - " return texts1, texts2, to_categorical(np.asarray(labels, dtype='int32'))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Because Keras can do the train/test split for us, we'll load *all* SNLI triples from one file." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "texts,hypotheses,labels = read_snli('snli/snli_1.0_train.jsonl')" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "def create_dataset(nlp, texts, hypotheses, num_oov, max_length, norm_vectors = True):\n", - " sents = texts + hypotheses\n", - " \n", - " # the extra +1 is for a zero vector represting NULL for padding\n", - " num_vectors = max(lex.rank for lex in nlp.vocab) + 2 \n", - " \n", - " # create random vectors for OOV tokens\n", - " oov = np.random.normal(size=(num_oov, nlp.vocab.vectors_length))\n", - " oov = oov / oov.sum(axis=1, keepdims=True)\n", - " \n", - " vectors = np.zeros((num_vectors + num_oov, nlp.vocab.vectors_length), dtype='float32')\n", - " vectors[num_vectors:, ] = oov\n", - " for lex in nlp.vocab:\n", - " if lex.has_vector and lex.vector_norm > 0:\n", - " vectors[lex.rank + 1] = lex.vector / lex.vector_norm if norm_vectors == True else lex.vector\n", - " \n", - " sents_as_ids = []\n", - " for sent in sents:\n", - " doc = nlp(sent)\n", - " word_ids = []\n", - " \n", - " for i, token in enumerate(doc):\n", - " # skip odd spaces from tokenizer\n", - " if token.has_vector and token.vector_norm == 0:\n", - " continue\n", - " \n", - " if i > max_length:\n", - " break\n", - " \n", - " if token.has_vector:\n", - " word_ids.append(token.rank + 1)\n", - " else:\n", - " # if we don't have a vector, pick an OOV entry\n", - " word_ids.append(token.rank % num_oov + num_vectors) \n", - " \n", - " # there must be a simpler way of generating padded arrays from lists...\n", - " word_id_vec = np.zeros((max_length), dtype='int')\n", - " clipped_len = min(max_length, len(word_ids))\n", - " word_id_vec[:clipped_len] = word_ids[:clipped_len]\n", - " sents_as_ids.append(word_id_vec)\n", - " \n", - " \n", - " return vectors, np.array(sents_as_ids[:len(texts)]), np.array(sents_as_ids[len(texts):])" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "sem_vectors, text_vectors, hypothesis_vectors = create_dataset(nlp, texts, hypotheses, 100, 50, True)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "texts_test,hypotheses_test,labels_test = read_snli('snli/snli_1.0_test.jsonl')" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "_, text_vectors_test, 
hypothesis_vectors_test = create_dataset(nlp, texts_test, hypotheses_test, 100, 50, True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We use spaCy to tokenize the sentences and return, when available, a semantic vector for each token. \n", - "\n", - "OOV terms (tokens for which no semantic vector is available) are assigned to one of a set of randomly-generated OOV vectors, per (Parikh et al, 2016).\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that we will clip sentences to 50 words maximum." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "from keras import layers, Model, models\n", - "from keras import backend as K" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Building the model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The embedding layer copies the 300-dimensional GloVe vectors into GPU memory. Per (Parikh et al, 2016), the vectors, which are not adapted during training, are projected down to lower-dimensional vectors using a trained projection matrix." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "def create_embedding(vectors, max_length, projected_dim):\n", - " return models.Sequential([\n", - " layers.Embedding(\n", - " vectors.shape[0],\n", - " vectors.shape[1],\n", - " input_length=max_length,\n", - " weights=[vectors],\n", - " trainable=False),\n", - " \n", - " layers.TimeDistributed(\n", - " layers.Dense(projected_dim,\n", - " activation=None,\n", - " use_bias=False))\n", - " ])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The Parikh model makes use of three feedforward blocks that construct nonlinear combinations of their input. Each block contains two ReLU layers and two dropout layers." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "def create_feedforward(num_units=200, activation='relu', dropout_rate=0.2):\n", - " return models.Sequential([\n", - " layers.Dense(num_units, activation=activation),\n", - " layers.Dropout(dropout_rate),\n", - " layers.Dense(num_units, activation=activation),\n", - " layers.Dropout(dropout_rate)\n", - " ])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The basic idea of the (Parikh et al, 2016) model is to:\n", - "\n", - "1. *Align*: Construct an alignment of subphrases in the text and hypothesis using an attention-like mechanism, called \"decompositional\" because the layer is applied to each of the two sentences individually rather than to their product. The dot product of the nonlinear transformations of the inputs is then normalized vertically and horizontally to yield a pair of \"soft\" alignment structures, from text->hypothesis and hypothesis->text. Concretely, for each word in one sentence, a multinomial distribution is computed over the words of the other sentence, by learning a multinomial logistic with softmax target.\n", - "2. *Compare*: Each word is now compared to its aligned phrase using a function modeled as a two-layer feedforward ReLU network. The output is a high-dimensional representation of the strength of association between word and aligned phrase.\n", - "3. *Aggregate*: The comparison vectors are summed, separately, for the text and the hypothesis. 
The result is two vectors: one that describes the degree of association of the text to the hypothesis, and the second, of the hypothesis to the text.\n", - "4. Finally, these two vectors are processed by a dense layer followed by a softmax classifier, as usual.\n", - "\n", - "Note that because in entailment the truth conditions of the consequent must be a subset of those of the antecedent, it is not obvious that we need both vectors in step (3). Entailment is not symmetric. It may be enough to just use the hypothesis->text vector. We will explore this possibility later." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We need a couple of little functions for Lambda layers to normalize and aggregate weights:" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "def normalizer(axis):\n", - " def _normalize(att_weights):\n", - " exp_weights = K.exp(att_weights)\n", - " sum_weights = K.sum(exp_weights, axis=axis, keepdims=True)\n", - " return exp_weights/sum_weights\n", - " return _normalize\n", - "\n", - "def sum_word(x):\n", - " return K.sum(x, axis=1)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "def build_model(vectors, max_length, num_hidden, num_classes, projected_dim, entail_dir='both'):\n", - " input1 = layers.Input(shape=(max_length,), dtype='int32', name='words1')\n", - " input2 = layers.Input(shape=(max_length,), dtype='int32', name='words2')\n", - " \n", - " # embeddings (projected)\n", - " embed = create_embedding(vectors, max_length, projected_dim)\n", - " \n", - " a = embed(input1)\n", - " b = embed(input2)\n", - " \n", - " # step 1: attend\n", - " F = create_feedforward(num_hidden)\n", - " att_weights = layers.dot([F(a), F(b)], axes=-1)\n", - " \n", - " G = create_feedforward(num_hidden)\n", - " \n", - " if entail_dir == 'both':\n", - " norm_weights_a = layers.Lambda(normalizer(1))(att_weights)\n", - " norm_weights_b = layers.Lambda(normalizer(2))(att_weights)\n", - " alpha = layers.dot([norm_weights_a, a], axes=1)\n", - " beta = layers.dot([norm_weights_b, b], axes=1)\n", - "\n", - " # step 2: compare\n", - " comp1 = layers.concatenate([a, beta])\n", - " comp2 = layers.concatenate([b, alpha])\n", - " v1 = layers.TimeDistributed(G)(comp1)\n", - " v2 = layers.TimeDistributed(G)(comp2)\n", - "\n", - " # step 3: aggregate\n", - " v1_sum = layers.Lambda(sum_word)(v1)\n", - " v2_sum = layers.Lambda(sum_word)(v2)\n", - " concat = layers.concatenate([v1_sum, v2_sum])\n", - " elif entail_dir == 'left':\n", - " norm_weights_a = layers.Lambda(normalizer(1))(att_weights)\n", - " alpha = layers.dot([norm_weights_a, a], axes=1)\n", - " comp2 = layers.concatenate([b, alpha])\n", - " v2 = layers.TimeDistributed(G)(comp2)\n", - " v2_sum = layers.Lambda(sum_word)(v2)\n", - " concat = v2_sum\n", - " else:\n", - " norm_weights_b = layers.Lambda(normalizer(2))(att_weights)\n", - " beta = layers.dot([norm_weights_b, b], axes=1)\n", - " comp1 = layers.concatenate([a, beta])\n", - " v1 = layers.TimeDistributed(G)(comp1)\n", - " v1_sum = layers.Lambda(sum_word)(v1)\n", - " concat = v1_sum\n", - " \n", - " H = create_feedforward(num_hidden)\n", - " out = H(concat)\n", - " out = layers.Dense(num_classes, activation='softmax')(out)\n", - " \n", - " model = Model([input1, input2], out)\n", - " \n", - " model.compile(optimizer='adam',\n", - " loss='categorical_crossentropy',\n", - " metrics=['accuracy'])\n", - " return model\n", - " \n", - " \n", - " " 
- ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "__________________________________________________________________________________________________\n", - "Layer (type) Output Shape Param # Connected to \n", - "==================================================================================================\n", - "words1 (InputLayer) (None, 50) 0 \n", - "__________________________________________________________________________________________________\n", - "words2 (InputLayer) (None, 50) 0 \n", - "__________________________________________________________________________________________________\n", - "sequential_1 (Sequential) (None, 50, 200) 321381600 words1[0][0] \n", - " words2[0][0] \n", - "__________________________________________________________________________________________________\n", - "sequential_2 (Sequential) (None, 50, 200) 80400 sequential_1[1][0] \n", - " sequential_1[2][0] \n", - "__________________________________________________________________________________________________\n", - "dot_1 (Dot) (None, 50, 50) 0 sequential_2[1][0] \n", - " sequential_2[2][0] \n", - "__________________________________________________________________________________________________\n", - "lambda_2 (Lambda) (None, 50, 50) 0 dot_1[0][0] \n", - "__________________________________________________________________________________________________\n", - "lambda_1 (Lambda) (None, 50, 50) 0 dot_1[0][0] \n", - "__________________________________________________________________________________________________\n", - "dot_3 (Dot) (None, 50, 200) 0 lambda_2[0][0] \n", - " sequential_1[2][0] \n", - "__________________________________________________________________________________________________\n", - "dot_2 (Dot) (None, 50, 200) 0 lambda_1[0][0] \n", - " sequential_1[1][0] \n", - "__________________________________________________________________________________________________\n", - "concatenate_1 (Concatenate) (None, 50, 400) 0 sequential_1[1][0] \n", - " dot_3[0][0] \n", - "__________________________________________________________________________________________________\n", - "concatenate_2 (Concatenate) (None, 50, 400) 0 sequential_1[2][0] \n", - " dot_2[0][0] \n", - "__________________________________________________________________________________________________\n", - "time_distributed_2 (TimeDistrib (None, 50, 200) 120400 concatenate_1[0][0] \n", - "__________________________________________________________________________________________________\n", - "time_distributed_3 (TimeDistrib (None, 50, 200) 120400 concatenate_2[0][0] \n", - "__________________________________________________________________________________________________\n", - "lambda_3 (Lambda) (None, 200) 0 time_distributed_2[0][0] \n", - "__________________________________________________________________________________________________\n", - "lambda_4 (Lambda) (None, 200) 0 time_distributed_3[0][0] \n", - "__________________________________________________________________________________________________\n", - "concatenate_3 (Concatenate) (None, 400) 0 lambda_3[0][0] \n", - " lambda_4[0][0] \n", - "__________________________________________________________________________________________________\n", - "sequential_4 (Sequential) (None, 200) 120400 concatenate_3[0][0] \n", - "__________________________________________________________________________________________________\n", - "dense_8 (Dense) (None, 3) 603 
sequential_4[1][0] \n", - "==================================================================================================\n", - "Total params: 321,703,403\n", - "Trainable params: 381,803\n", - "Non-trainable params: 321,321,600\n", - "__________________________________________________________________________________________________\n" - ] - } - ], - "source": [ - "K.clear_session()\n", - "m = build_model(sem_vectors, 50, 200, 3, 200)\n", - "m.summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The number of trainable parameters, ~381k, is the number given by Parikh et al, so we're on the right track." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Training the model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Parikh et al use tiny batches of 4, training for 50MM batches, which amounts to around 500 epochs. Here we'll use large batches to better use the GPU, and train for fewer epochs -- for purposes of this experiment." - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Train on 549367 samples, validate on 9824 samples\n", - "Epoch 1/50\n", - "549367/549367 [==============================] - 34s 62us/step - loss: 0.7599 - acc: 0.6617 - val_loss: 0.5396 - val_acc: 0.7861\n", - "Epoch 2/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.5611 - acc: 0.7763 - val_loss: 0.4892 - val_acc: 0.8085\n", - "Epoch 3/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.5212 - acc: 0.7948 - val_loss: 0.4574 - val_acc: 0.8261\n", - "Epoch 4/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4986 - acc: 0.8045 - val_loss: 0.4410 - val_acc: 0.8274\n", - "Epoch 5/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4819 - acc: 0.8114 - val_loss: 0.4224 - val_acc: 0.8383\n", - "Epoch 6/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4714 - acc: 0.8166 - val_loss: 0.4200 - val_acc: 0.8379\n", - "Epoch 7/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4633 - acc: 0.8203 - val_loss: 0.4098 - val_acc: 0.8457\n", - "Epoch 8/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4558 - acc: 0.8232 - val_loss: 0.4114 - val_acc: 0.8415\n", - "Epoch 9/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4508 - acc: 0.8250 - val_loss: 0.4062 - val_acc: 0.8477\n", - "Epoch 10/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4433 - acc: 0.8286 - val_loss: 0.3982 - val_acc: 0.8486\n", - "Epoch 11/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4388 - acc: 0.8307 - val_loss: 0.3953 - val_acc: 0.8497\n", - "Epoch 12/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4351 - acc: 0.8321 - val_loss: 0.3973 - val_acc: 0.8522\n", - "Epoch 13/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4309 - acc: 0.8342 - val_loss: 0.3939 - val_acc: 0.8539\n", - "Epoch 14/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4269 - acc: 0.8355 - val_loss: 0.3932 - val_acc: 0.8517\n", - "Epoch 15/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4247 - acc: 0.8369 - 
val_loss: 0.3938 - val_acc: 0.8515\n", - "Epoch 16/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4208 - acc: 0.8379 - val_loss: 0.3936 - val_acc: 0.8504\n", - "Epoch 17/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4194 - acc: 0.8390 - val_loss: 0.3885 - val_acc: 0.8560\n", - "Epoch 18/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4162 - acc: 0.8402 - val_loss: 0.3874 - val_acc: 0.8561\n", - "Epoch 19/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4140 - acc: 0.8409 - val_loss: 0.3889 - val_acc: 0.8545\n", - "Epoch 20/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4114 - acc: 0.8426 - val_loss: 0.3864 - val_acc: 0.8583\n", - "Epoch 21/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4092 - acc: 0.8430 - val_loss: 0.3870 - val_acc: 0.8561\n", - "Epoch 22/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4062 - acc: 0.8442 - val_loss: 0.3852 - val_acc: 0.8577\n", - "Epoch 23/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4050 - acc: 0.8450 - val_loss: 0.3850 - val_acc: 0.8578\n", - "Epoch 24/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4035 - acc: 0.8455 - val_loss: 0.3825 - val_acc: 0.8555\n", - "Epoch 25/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.4018 - acc: 0.8460 - val_loss: 0.3837 - val_acc: 0.8573\n", - "Epoch 26/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3989 - acc: 0.8476 - val_loss: 0.3843 - val_acc: 0.8599\n", - "Epoch 27/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3979 - acc: 0.8481 - val_loss: 0.3841 - val_acc: 0.8589\n", - "Epoch 28/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3967 - acc: 0.8484 - val_loss: 0.3811 - val_acc: 0.8575\n", - "Epoch 29/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3956 - acc: 0.8492 - val_loss: 0.3829 - val_acc: 0.8589\n", - "Epoch 30/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3938 - acc: 0.8499 - val_loss: 0.3859 - val_acc: 0.8562\n", - "Epoch 31/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3925 - acc: 0.8500 - val_loss: 0.3798 - val_acc: 0.8587\n", - "Epoch 32/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3906 - acc: 0.8509 - val_loss: 0.3834 - val_acc: 0.8569\n", - "Epoch 33/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3893 - acc: 0.8511 - val_loss: 0.3806 - val_acc: 0.8588\n", - "Epoch 34/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3885 - acc: 0.8515 - val_loss: 0.3828 - val_acc: 0.8603\n", - "Epoch 35/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3879 - acc: 0.8520 - val_loss: 0.3800 - val_acc: 0.8594\n", - "Epoch 36/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3860 - acc: 0.8530 - val_loss: 0.3796 - val_acc: 0.8577\n", - "Epoch 37/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3856 - acc: 0.8532 - val_loss: 0.3857 - val_acc: 0.8591\n", - "Epoch 38/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3838 - 
acc: 0.8535 - val_loss: 0.3835 - val_acc: 0.8603\n", - "Epoch 39/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3830 - acc: 0.8543 - val_loss: 0.3830 - val_acc: 0.8599\n", - "Epoch 40/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3818 - acc: 0.8548 - val_loss: 0.3832 - val_acc: 0.8559\n", - "Epoch 41/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3806 - acc: 0.8551 - val_loss: 0.3845 - val_acc: 0.8553\n", - "Epoch 42/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3803 - acc: 0.8550 - val_loss: 0.3789 - val_acc: 0.8617\n", - "Epoch 43/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3791 - acc: 0.8556 - val_loss: 0.3835 - val_acc: 0.8580\n", - "Epoch 44/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3778 - acc: 0.8565 - val_loss: 0.3799 - val_acc: 0.8580\n", - "Epoch 45/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3766 - acc: 0.8571 - val_loss: 0.3790 - val_acc: 0.8625\n", - "Epoch 46/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3770 - acc: 0.8569 - val_loss: 0.3820 - val_acc: 0.8590\n", - "Epoch 47/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3761 - acc: 0.8573 - val_loss: 0.3831 - val_acc: 0.8581\n", - "Epoch 48/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3739 - acc: 0.8579 - val_loss: 0.3828 - val_acc: 0.8599\n", - "Epoch 49/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3738 - acc: 0.8577 - val_loss: 0.3785 - val_acc: 0.8590\n", - "Epoch 50/50\n", - "549367/549367 [==============================] - 33s 60us/step - loss: 0.3726 - acc: 0.8580 - val_loss: 0.3820 - val_acc: 0.8585\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "m.fit([text_vectors, hypothesis_vectors], labels, batch_size=1024, epochs=50,validation_data=([text_vectors_test, hypothesis_vectors_test], labels_test))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The result is broadly in the region reported by Parikh et al: ~86 vs 86.3%. The small difference might be accounted by differences in `max_length` (here set at 50), in the training regime, and that here we use Keras' built-in validation splitting rather than the SNLI test set." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Experiment: the asymmetric model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It was suggested earlier that, based on the semantics of entailment, the vector representing the strength of association between the hypothesis to the text is all that is needed for classifying the entailment.\n", - "\n", - "The following model removes consideration of the complementary vector (text to hypothesis) from the computation. This will decrease the paramater count slightly, because the final dense layers will be smaller, and speed up the forward pass when predicting, because fewer calculations will be needed." 
- ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "__________________________________________________________________________________________________\n", - "Layer (type) Output Shape Param # Connected to \n", - "==================================================================================================\n", - "words2 (InputLayer) (None, 50) 0 \n", - "__________________________________________________________________________________________________\n", - "words1 (InputLayer) (None, 50) 0 \n", - "__________________________________________________________________________________________________\n", - "sequential_5 (Sequential) (None, 50, 200) 321381600 words1[0][0] \n", - " words2[0][0] \n", - "__________________________________________________________________________________________________\n", - "sequential_6 (Sequential) (None, 50, 200) 80400 sequential_5[1][0] \n", - " sequential_5[2][0] \n", - "__________________________________________________________________________________________________\n", - "dot_4 (Dot) (None, 50, 50) 0 sequential_6[1][0] \n", - " sequential_6[2][0] \n", - "__________________________________________________________________________________________________\n", - "lambda_5 (Lambda) (None, 50, 50) 0 dot_4[0][0] \n", - "__________________________________________________________________________________________________\n", - "dot_5 (Dot) (None, 50, 200) 0 lambda_5[0][0] \n", - " sequential_5[1][0] \n", - "__________________________________________________________________________________________________\n", - "concatenate_4 (Concatenate) (None, 50, 400) 0 sequential_5[2][0] \n", - " dot_5[0][0] \n", - "__________________________________________________________________________________________________\n", - "time_distributed_5 (TimeDistrib (None, 50, 200) 120400 concatenate_4[0][0] \n", - "__________________________________________________________________________________________________\n", - "lambda_6 (Lambda) (None, 200) 0 time_distributed_5[0][0] \n", - "__________________________________________________________________________________________________\n", - "sequential_8 (Sequential) (None, 200) 80400 lambda_6[0][0] \n", - "__________________________________________________________________________________________________\n", - "dense_16 (Dense) (None, 3) 603 sequential_8[1][0] \n", - "==================================================================================================\n", - "Total params: 321,663,403\n", - "Trainable params: 341,803\n", - "Non-trainable params: 321,321,600\n", - "__________________________________________________________________________________________________\n" - ] - } - ], - "source": [ - "m1 = build_model(sem_vectors, 50, 200, 3, 200, 'left')\n", - "m1.summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The parameter count has indeed decreased by 40,000, corresponding to the 200x200 smaller H function." 
- ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Train on 549367 samples, validate on 9824 samples\n", - "Epoch 1/50\n", - "549367/549367 [==============================] - 25s 46us/step - loss: 0.7331 - acc: 0.6770 - val_loss: 0.5257 - val_acc: 0.7936\n", - "Epoch 2/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.5518 - acc: 0.7799 - val_loss: 0.4717 - val_acc: 0.8159\n", - "Epoch 3/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.5147 - acc: 0.7967 - val_loss: 0.4449 - val_acc: 0.8278\n", - "Epoch 4/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4948 - acc: 0.8060 - val_loss: 0.4326 - val_acc: 0.8344\n", - "Epoch 5/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4814 - acc: 0.8122 - val_loss: 0.4247 - val_acc: 0.8359\n", - "Epoch 6/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4712 - acc: 0.8162 - val_loss: 0.4143 - val_acc: 0.8430\n", - "Epoch 7/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4635 - acc: 0.8205 - val_loss: 0.4172 - val_acc: 0.8401\n", - "Epoch 8/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4570 - acc: 0.8223 - val_loss: 0.4106 - val_acc: 0.8422\n", - "Epoch 9/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4505 - acc: 0.8259 - val_loss: 0.4043 - val_acc: 0.8451\n", - "Epoch 10/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4459 - acc: 0.8280 - val_loss: 0.4050 - val_acc: 0.8467\n", - "Epoch 11/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4405 - acc: 0.8300 - val_loss: 0.3975 - val_acc: 0.8481\n", - "Epoch 12/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4360 - acc: 0.8324 - val_loss: 0.4026 - val_acc: 0.8496\n", - "Epoch 13/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4327 - acc: 0.8334 - val_loss: 0.4024 - val_acc: 0.8471\n", - "Epoch 14/50\n", - "549367/549367 [==============================] - 24s 45us/step - loss: 0.4293 - acc: 0.8350 - val_loss: 0.3955 - val_acc: 0.8496\n", - "Epoch 15/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4263 - acc: 0.8369 - val_loss: 0.3980 - val_acc: 0.8490\n", - "Epoch 16/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4236 - acc: 0.8377 - val_loss: 0.3958 - val_acc: 0.8496\n", - "Epoch 17/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4213 - acc: 0.8384 - val_loss: 0.3954 - val_acc: 0.8496\n", - "Epoch 18/50\n", - "549367/549367 [==============================] - 24s 45us/step - loss: 0.4187 - acc: 0.8394 - val_loss: 0.3929 - val_acc: 0.8514\n", - "Epoch 19/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4157 - acc: 0.8409 - val_loss: 0.3939 - val_acc: 0.8507\n", - "Epoch 20/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4135 - acc: 0.8417 - val_loss: 0.3953 - val_acc: 0.8522\n", - "Epoch 21/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4122 - acc: 0.8424 - val_loss: 0.3974 - val_acc: 0.8506\n", - "Epoch 22/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 
0.4099 - acc: 0.8435 - val_loss: 0.3918 - val_acc: 0.8522\n", - "Epoch 23/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4075 - acc: 0.8443 - val_loss: 0.3901 - val_acc: 0.8513\n", - "Epoch 24/50\n", - "549367/549367 [==============================] - 24s 44us/step - loss: 0.4067 - acc: 0.8447 - val_loss: 0.3885 - val_acc: 0.8543\n", - "Epoch 25/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4047 - acc: 0.8454 - val_loss: 0.3846 - val_acc: 0.8531\n", - "Epoch 26/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.4031 - acc: 0.8461 - val_loss: 0.3864 - val_acc: 0.8562\n", - "Epoch 27/50\n", - "549367/549367 [==============================] - 24s 45us/step - loss: 0.4020 - acc: 0.8467 - val_loss: 0.3874 - val_acc: 0.8546\n", - "Epoch 28/50\n", - "549367/549367 [==============================] - 24s 45us/step - loss: 0.4001 - acc: 0.8473 - val_loss: 0.3848 - val_acc: 0.8534\n", - "Epoch 29/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3991 - acc: 0.8479 - val_loss: 0.3865 - val_acc: 0.8562\n", - "Epoch 30/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3976 - acc: 0.8484 - val_loss: 0.3833 - val_acc: 0.8574\n", - "Epoch 31/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3961 - acc: 0.8487 - val_loss: 0.3846 - val_acc: 0.8585\n", - "Epoch 32/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3942 - acc: 0.8498 - val_loss: 0.3805 - val_acc: 0.8573\n", - "Epoch 33/50\n", - "549367/549367 [==============================] - 24s 44us/step - loss: 0.3935 - acc: 0.8503 - val_loss: 0.3856 - val_acc: 0.8579\n", - "Epoch 34/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3923 - acc: 0.8507 - val_loss: 0.3829 - val_acc: 0.8560\n", - "Epoch 35/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3920 - acc: 0.8508 - val_loss: 0.3864 - val_acc: 0.8575\n", - "Epoch 36/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3907 - acc: 0.8516 - val_loss: 0.3873 - val_acc: 0.8563\n", - "Epoch 37/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3891 - acc: 0.8519 - val_loss: 0.3850 - val_acc: 0.8570\n", - "Epoch 38/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3872 - acc: 0.8522 - val_loss: 0.3815 - val_acc: 0.8591\n", - "Epoch 39/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3887 - acc: 0.8520 - val_loss: 0.3829 - val_acc: 0.8590\n", - "Epoch 40/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3868 - acc: 0.8531 - val_loss: 0.3807 - val_acc: 0.8600\n", - "Epoch 41/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3859 - acc: 0.8537 - val_loss: 0.3832 - val_acc: 0.8574\n", - "Epoch 42/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3849 - acc: 0.8537 - val_loss: 0.3850 - val_acc: 0.8576\n", - "Epoch 43/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3834 - acc: 0.8541 - val_loss: 0.3825 - val_acc: 0.8563\n", - "Epoch 44/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3829 - acc: 0.8548 - val_loss: 0.3844 - val_acc: 0.8540\n", - "Epoch 45/50\n", - "549367/549367 [==============================] - 25s 
45us/step - loss: 0.3816 - acc: 0.8552 - val_loss: 0.3841 - val_acc: 0.8559\n", - "Epoch 46/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3816 - acc: 0.8549 - val_loss: 0.3880 - val_acc: 0.8567\n", - "Epoch 47/50\n", - "549367/549367 [==============================] - 24s 45us/step - loss: 0.3799 - acc: 0.8559 - val_loss: 0.3767 - val_acc: 0.8635\n", - "Epoch 48/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3800 - acc: 0.8560 - val_loss: 0.3786 - val_acc: 0.8563\n", - "Epoch 49/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3781 - acc: 0.8563 - val_loss: 0.3812 - val_acc: 0.8596\n", - "Epoch 50/50\n", - "549367/549367 [==============================] - 25s 45us/step - loss: 0.3788 - acc: 0.8560 - val_loss: 0.3782 - val_acc: 0.8601\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "m1.fit([text_vectors, hypothesis_vectors], labels, batch_size=1024, epochs=50,validation_data=([text_vectors_test, hypothesis_vectors_test], labels_test))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This model performs the same as the slightly more complex model that evaluates alignments in both directions. Note also that processing time is improved, from 64 down to 48 microseconds per step. \n", - "\n", - "Let's now look at an asymmetric model that evaluates text to hypothesis comparisons. The prediction is that such a model will correctly classify a decent proportion of the exemplars, but not as accurately as the previous two.\n", - "\n", - "We'll just use 10 epochs for expediency." - ] - }, - { - "cell_type": "code", - "execution_count": 96, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "__________________________________________________________________________________________________\n", - "Layer (type) Output Shape Param # Connected to \n", - "==================================================================================================\n", - "words1 (InputLayer) (None, 50) 0 \n", - "__________________________________________________________________________________________________\n", - "words2 (InputLayer) (None, 50) 0 \n", - "__________________________________________________________________________________________________\n", - "sequential_13 (Sequential) (None, 50, 200) 321381600 words1[0][0] \n", - " words2[0][0] \n", - "__________________________________________________________________________________________________\n", - "sequential_14 (Sequential) (None, 50, 200) 80400 sequential_13[1][0] \n", - " sequential_13[2][0] \n", - "__________________________________________________________________________________________________\n", - "dot_8 (Dot) (None, 50, 50) 0 sequential_14[1][0] \n", - " sequential_14[2][0] \n", - "__________________________________________________________________________________________________\n", - "lambda_9 (Lambda) (None, 50, 50) 0 dot_8[0][0] \n", - "__________________________________________________________________________________________________\n", - "dot_9 (Dot) (None, 50, 200) 0 lambda_9[0][0] \n", - " sequential_13[2][0] \n", - "__________________________________________________________________________________________________\n", - "concatenate_6 (Concatenate) (None, 50, 400) 0 sequential_13[1][0] \n", - " dot_9[0][0] \n", - 
"__________________________________________________________________________________________________\n", - "time_distributed_9 (TimeDistrib (None, 50, 200) 120400 concatenate_6[0][0] \n", - "__________________________________________________________________________________________________\n", - "lambda_10 (Lambda) (None, 200) 0 time_distributed_9[0][0] \n", - "__________________________________________________________________________________________________\n", - "sequential_16 (Sequential) (None, 200) 80400 lambda_10[0][0] \n", - "__________________________________________________________________________________________________\n", - "dense_32 (Dense) (None, 3) 603 sequential_16[1][0] \n", - "==================================================================================================\n", - "Total params: 321,663,403\n", - "Trainable params: 341,803\n", - "Non-trainable params: 321,321,600\n", - "__________________________________________________________________________________________________\n" - ] - } - ], - "source": [ - "m2 = build_model(sem_vectors, 50, 200, 3, 200, 'right')\n", - "m2.summary()" - ] - }, - { - "cell_type": "code", - "execution_count": 97, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Train on 455226 samples, validate on 113807 samples\n", - "Epoch 1/10\n", - "455226/455226 [==============================] - 22s 49us/step - loss: 0.8920 - acc: 0.5771 - val_loss: 0.8001 - val_acc: 0.6435\n", - "Epoch 2/10\n", - "455226/455226 [==============================] - 22s 47us/step - loss: 0.7808 - acc: 0.6553 - val_loss: 0.7267 - val_acc: 0.6855\n", - "Epoch 3/10\n", - "455226/455226 [==============================] - 22s 47us/step - loss: 0.7329 - acc: 0.6825 - val_loss: 0.6966 - val_acc: 0.7006\n", - "Epoch 4/10\n", - "455226/455226 [==============================] - 22s 47us/step - loss: 0.7055 - acc: 0.6978 - val_loss: 0.6713 - val_acc: 0.7150\n", - "Epoch 5/10\n", - "455226/455226 [==============================] - 22s 47us/step - loss: 0.6862 - acc: 0.7081 - val_loss: 0.6533 - val_acc: 0.7253\n", - "Epoch 6/10\n", - "455226/455226 [==============================] - 21s 47us/step - loss: 0.6694 - acc: 0.7179 - val_loss: 0.6472 - val_acc: 0.7277\n", - "Epoch 7/10\n", - "455226/455226 [==============================] - 22s 47us/step - loss: 0.6555 - acc: 0.7252 - val_loss: 0.6338 - val_acc: 0.7347\n", - "Epoch 8/10\n", - "455226/455226 [==============================] - 22s 48us/step - loss: 0.6434 - acc: 0.7310 - val_loss: 0.6246 - val_acc: 0.7385\n", - "Epoch 9/10\n", - "455226/455226 [==============================] - 22s 47us/step - loss: 0.6325 - acc: 0.7367 - val_loss: 0.6164 - val_acc: 0.7424\n", - "Epoch 10/10\n", - "455226/455226 [==============================] - 22s 47us/step - loss: 0.6216 - acc: 0.7426 - val_loss: 0.6082 - val_acc: 0.7478\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 97, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "m2.fit([text_vectors, hypothesis_vectors], labels, batch_size=1024, epochs=10,validation_split=.2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Comparing this fit to the validation accuracy of the previous two models after 10 epochs, we observe that its accuracy is roughly 10% lower.\n", - "\n", - "It is reassuring that the neural modeling here reproduces what we know from the semantics of natural language!" 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/pipeline/custom_attr_methods.py b/examples/pipeline/custom_attr_methods.py deleted file mode 100644 index 7f97bc1c3..000000000 --- a/examples/pipeline/custom_attr_methods.py +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 -"""This example contains several snippets of methods that can be set via custom -Doc, Token or Span attributes in spaCy v2.0. Attribute methods act like -they're "bound" to the object and are partially applied – i.e. the object -they're called on is passed in as the first argument. - -* Custom pipeline components: https://spacy.io//usage/processing-pipelines#custom-components - -Compatible with: spaCy v2.0.0+ -Last tested with: v2.1.0 -""" -from __future__ import unicode_literals, print_function - -import plac -from spacy.lang.en import English -from spacy.tokens import Doc, Span -from spacy import displacy -from pathlib import Path - - -@plac.annotations( - output_dir=("Output directory for saved HTML", "positional", None, Path) -) -def main(output_dir=None): - nlp = English() # start off with blank English class - - Doc.set_extension("overlap", method=overlap_tokens) - doc1 = nlp("Peach emoji is where it has always been.") - doc2 = nlp("Peach is the superior emoji.") - print("Text 1:", doc1.text) - print("Text 2:", doc2.text) - print("Overlapping tokens:", doc1._.overlap(doc2)) - - Doc.set_extension("to_html", method=to_html) - doc = nlp("This is a sentence about Apple.") - # add entity manually for demo purposes, to make it work without a model - doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings["ORG"])] - print("Text:", doc.text) - doc._.to_html(output=output_dir, style="ent") - - -def to_html(doc, output="/tmp", style="dep"): - """Doc method extension for saving the current state as a displaCy - visualization. - """ - # generate filename from first six non-punct tokens - file_name = "-".join([w.text for w in doc[:6] if not w.is_punct]) + ".html" - html = displacy.render(doc, style=style, page=True) # render markup - if output is not None: - output_path = Path(output) - if not output_path.exists(): - output_path.mkdir() - output_file = Path(output) / file_name - output_file.open("w", encoding="utf-8").write(html) # save to file - print("Saved HTML to {}".format(output_file)) - else: - print(html) - - -def overlap_tokens(doc, other_doc): - """Get the tokens from the original Doc that are also in the comparison Doc. - """ - overlap = [] - other_tokens = [token.text for token in other_doc] - for token in doc: - if token.text in other_tokens: - overlap.append(token) - return overlap - - -if __name__ == "__main__": - plac.call(main) - - # Expected output: - # Text 1: Peach emoji is where it has always been. - # Text 2: Peach is the superior emoji. - # Overlapping tokens: [Peach, emoji, is, .] 
diff --git a/examples/pipeline/custom_component_countries_api.py b/examples/pipeline/custom_component_countries_api.py deleted file mode 100644 index 241c0af37..000000000 --- a/examples/pipeline/custom_component_countries_api.py +++ /dev/null @@ -1,130 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 -"""Example of a spaCy v2.0 pipeline component that requests all countries via -the REST Countries API, merges country names into one token, assigns entity -labels and sets attributes on country tokens, e.g. the capital and lat/lng -coordinates. Can be extended with more details from the API. - -* REST Countries API: https://restcountries.eu (Mozilla Public License MPL 2.0) -* Custom pipeline components: https://spacy.io//usage/processing-pipelines#custom-components - -Compatible with: spaCy v2.0.0+ -Last tested with: v2.1.0 -Prerequisites: pip install requests -""" -from __future__ import unicode_literals, print_function - -import requests -import plac -from spacy.lang.en import English -from spacy.matcher import PhraseMatcher -from spacy.tokens import Doc, Span, Token - - -def main(): - # For simplicity, we start off with only the blank English Language class - # and no model or pre-defined pipeline loaded. - nlp = English() - rest_countries = RESTCountriesComponent(nlp) # initialise component - nlp.add_pipe(rest_countries) # add it to the pipeline - doc = nlp("Some text about Colombia and the Czech Republic") - print("Pipeline", nlp.pipe_names) # pipeline contains component name - print("Doc has countries", doc._.has_country) # Doc contains countries - for token in doc: - if token._.is_country: - print( - token.text, - token._.country_capital, - token._.country_latlng, - token._.country_flag, - ) # country data - print("Entities", [(e.text, e.label_) for e in doc.ents]) # entities - - -class RESTCountriesComponent(object): - """spaCy v2.0 pipeline component that requests all countries via - the REST Countries API, merges country names into one token, assigns entity - labels and sets attributes on country tokens. - """ - - name = "rest_countries" # component name, will show up in the pipeline - - def __init__(self, nlp, label="GPE"): - """Initialise the pipeline component. The shared nlp instance is used - to initialise the matcher with the shared vocab, get the label ID and - generate Doc objects as phrase match patterns. - """ - # Make request once on initialisation and store the data - r = requests.get("https://restcountries.eu/rest/v2/all") - r.raise_for_status() # make sure requests raises an error if it fails - countries = r.json() - - # Convert API response to dict keyed by country name for easy lookup - # This could also be extended using the alternative and foreign language - # names provided by the API - self.countries = {c["name"]: c for c in countries} - self.label = nlp.vocab.strings[label] # get entity label ID - - # Set up the PhraseMatcher with Doc patterns for each country name - patterns = [nlp(c) for c in self.countries.keys()] - self.matcher = PhraseMatcher(nlp.vocab) - self.matcher.add("COUNTRIES", None, *patterns) - - # Register attribute on the Token. We'll be overwriting this based on - # the matches, so we're only setting a default value, not a getter. - # If no default value is set, it defaults to None. 
- Token.set_extension("is_country", default=False) - Token.set_extension("country_capital", default=False) - Token.set_extension("country_latlng", default=False) - Token.set_extension("country_flag", default=False) - - # Register attributes on Doc and Span via a getter that checks if one of - # the contained tokens is set to is_country == True. - Doc.set_extension("has_country", getter=self.has_country) - Span.set_extension("has_country", getter=self.has_country) - - def __call__(self, doc): - """Apply the pipeline component on a Doc object and modify it if matches - are found. Return the Doc, so it can be processed by the next component - in the pipeline, if available. - """ - matches = self.matcher(doc) - spans = [] # keep the spans for later so we can merge them afterwards - for _, start, end in matches: - # Generate Span representing the entity & set label - entity = Span(doc, start, end, label=self.label) - spans.append(entity) - # Set custom attribute on each token of the entity - # Can be extended with other data returned by the API, like - # currencies, country code, flag, calling code etc. - for token in entity: - token._.set("is_country", True) - token._.set("country_capital", self.countries[entity.text]["capital"]) - token._.set("country_latlng", self.countries[entity.text]["latlng"]) - token._.set("country_flag", self.countries[entity.text]["flag"]) - # Overwrite doc.ents and add entity – be careful not to replace! - doc.ents = list(doc.ents) + [entity] - for span in spans: - # Iterate over all spans and merge them into one token. This is done - # after setting the entities – otherwise, it would cause mismatched - # indices! - span.merge() - return doc # don't forget to return the Doc! - - def has_country(self, tokens): - """Getter for Doc and Span attributes. Returns True if one of the tokens - is a country. Since the getter is only called when we access the - attribute, we can refer to the Token's 'is_country' attribute here, - which is already set in the processing step.""" - return any([t._.get("is_country") for t in tokens]) - - -if __name__ == "__main__": - plac.call(main) - - # Expected output: - # Pipeline ['rest_countries'] - # Doc has countries True - # Colombia Bogotá [4.0, -72.0] https://restcountries.eu/data/col.svg - # Czech Republic Prague [49.75, 15.5] https://restcountries.eu/data/cze.svg - # Entities [('Colombia', 'GPE'), ('Czech Republic', 'GPE')] diff --git a/examples/pipeline/custom_component_entities.py b/examples/pipeline/custom_component_entities.py deleted file mode 100644 index a53b688b0..000000000 --- a/examples/pipeline/custom_component_entities.py +++ /dev/null @@ -1,115 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 -"""Example of a spaCy v2.0 pipeline component that sets entity annotations -based on list of single or multiple-word company names. Companies are -labelled as ORG and their spans are merged into one token. Additionally, -._.has_tech_org and ._.is_tech_org is set on the Doc/Span and Token -respectively. - -* Custom pipeline components: https://spacy.io//usage/processing-pipelines#custom-components - -Compatible with: spaCy v2.0.0+ -Last tested with: v2.1.0 -""" -from __future__ import unicode_literals, print_function - -import plac -from spacy.lang.en import English -from spacy.matcher import PhraseMatcher -from spacy.tokens import Doc, Span, Token - - -@plac.annotations( - text=("Text to process", "positional", None, str), - companies=("Names of technology companies", "positional", None, str), -) -def main(text="Alphabet Inc. 
is the company behind Google.", *companies): - # For simplicity, we start off with only the blank English Language class - # and no model or pre-defined pipeline loaded. - nlp = English() - if not companies: # set default companies if none are set via args - companies = ["Alphabet Inc.", "Google", "Netflix", "Apple"] # etc. - component = TechCompanyRecognizer(nlp, companies) # initialise component - nlp.add_pipe(component, last=True) # add last to the pipeline - - doc = nlp(text) - print("Pipeline", nlp.pipe_names) # pipeline contains component name - print("Tokens", [t.text for t in doc]) # company names from the list are merged - print("Doc has_tech_org", doc._.has_tech_org) # Doc contains tech orgs - print("Token 0 is_tech_org", doc[0]._.is_tech_org) # "Alphabet Inc." is a tech org - print("Token 1 is_tech_org", doc[1]._.is_tech_org) # "is" is not - print("Entities", [(e.text, e.label_) for e in doc.ents]) # all orgs are entities - - -class TechCompanyRecognizer(object): - """Example of a spaCy v2.0 pipeline component that sets entity annotations - based on list of single or multiple-word company names. Companies are - labelled as ORG and their spans are merged into one token. Additionally, - ._.has_tech_org and ._.is_tech_org is set on the Doc/Span and Token - respectively.""" - - name = "tech_companies" # component name, will show up in the pipeline - - def __init__(self, nlp, companies=tuple(), label="ORG"): - """Initialise the pipeline component. The shared nlp instance is used - to initialise the matcher with the shared vocab, get the label ID and - generate Doc objects as phrase match patterns. - """ - self.label = nlp.vocab.strings[label] # get entity label ID - - # Set up the PhraseMatcher – it can now take Doc objects as patterns, - # so even if the list of companies is long, it's very efficient - patterns = [nlp(org) for org in companies] - self.matcher = PhraseMatcher(nlp.vocab) - self.matcher.add("TECH_ORGS", None, *patterns) - - # Register attribute on the Token. We'll be overwriting this based on - # the matches, so we're only setting a default value, not a getter. - Token.set_extension("is_tech_org", default=False) - - # Register attributes on Doc and Span via a getter that checks if one of - # the contained tokens is set to is_tech_org == True. - Doc.set_extension("has_tech_org", getter=self.has_tech_org) - Span.set_extension("has_tech_org", getter=self.has_tech_org) - - def __call__(self, doc): - """Apply the pipeline component on a Doc object and modify it if matches - are found. Return the Doc, so it can be processed by the next component - in the pipeline, if available. - """ - matches = self.matcher(doc) - spans = [] # keep the spans for later so we can merge them afterwards - for _, start, end in matches: - # Generate Span representing the entity & set label - entity = Span(doc, start, end, label=self.label) - spans.append(entity) - # Set custom attribute on each token of the entity - for token in entity: - token._.set("is_tech_org", True) - # Overwrite doc.ents and add entity – be careful not to replace! - doc.ents = list(doc.ents) + [entity] - for span in spans: - # Iterate over all spans and merge them into one token. This is done - # after setting the entities – otherwise, it would cause mismatched - # indices! - span.merge() - return doc # don't forget to return the Doc! - - def has_tech_org(self, tokens): - """Getter for Doc and Span attributes. Returns True if one of the tokens - is a tech org. 
Since the getter is only called when we access the - attribute, we can refer to the Token's 'is_tech_org' attribute here, - which is already set in the processing step.""" - return any([t._.get("is_tech_org") for t in tokens]) - - -if __name__ == "__main__": - plac.call(main) - - # Expected output: - # Pipeline ['tech_companies'] - # Tokens ['Alphabet Inc.', 'is', 'the', 'company', 'behind', 'Google', '.'] - # Doc has_tech_org True - # Token 0 is_tech_org True - # Token 1 is_tech_org False - # Entities [('Alphabet Inc.', 'ORG'), ('Google', 'ORG')] diff --git a/examples/pipeline/custom_sentence_segmentation.py b/examples/pipeline/custom_sentence_segmentation.py deleted file mode 100644 index ff59ab187..000000000 --- a/examples/pipeline/custom_sentence_segmentation.py +++ /dev/null @@ -1,61 +0,0 @@ -"""Example of adding a pipeline component to prohibit sentence boundaries -before certain tokens. - -What we do is write to the token.is_sent_start attribute, which -takes values in {True, False, None}. The default value None allows the parser -to predict sentence segments. The value False prohibits the parser from inserting -a sentence boundary before that token. Note that fixing the sentence segmentation -should also improve the parse quality. - -The specific example here is drawn from https://github.com/explosion/spaCy/issues/2627 -Other versions of the model may not make the original mistake, so the specific -example might not be apt for future versions. - -Compatible with: spaCy v2.0.0+ -Last tested with: v2.1.0 -""" -import plac -import spacy - - -def prevent_sentence_boundaries(doc): - for token in doc: - if not can_be_sentence_start(token): - token.is_sent_start = False - return doc - - -def can_be_sentence_start(token): - if token.i == 0: - return True - # We're not checking for is_title here to ignore arbitrary titlecased - # tokens within sentences - # elif token.is_title: - # return True - elif token.nbor(-1).is_punct: - return True - elif token.nbor(-1).is_space: - return True - else: - return False - - -@plac.annotations( - text=("The raw text to process", "positional", None, str), - spacy_model=("spaCy model to use (with a parser)", "option", "m", str), -) -def main(text="Been here And I'm loving it.", spacy_model="en_core_web_lg"): - print("Using spaCy model '{}'".format(spacy_model)) - print("Processing text '{}'".format(text)) - nlp = spacy.load(spacy_model) - doc = nlp(text) - sentences = [sent.text.strip() for sent in doc.sents] - print("Before:", sentences) - nlp.add_pipe(prevent_sentence_boundaries, before="parser") - doc = nlp(text) - sentences = [sent.text.strip() for sent in doc.sents] - print("After:", sentences) - - -if __name__ == "__main__": - plac.call(main) diff --git a/examples/pipeline/fix_space_entities.py b/examples/pipeline/fix_space_entities.py deleted file mode 100644 index 686253eca..000000000 --- a/examples/pipeline/fix_space_entities.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 -"""Demonstrate adding a rule-based component that forces some tokens to not -be entities, before the NER tagger is applied. This is used to hotfix the issue -in https://github.com/explosion/spaCy/issues/2870, present as of spaCy v2.0.16. 
- -Compatible with: spaCy v2.0.0+ -Last tested with: v2.1.0 -""" -from __future__ import unicode_literals - -import spacy -from spacy.attrs import ENT_IOB - - -def fix_space_tags(doc): - ent_iobs = doc.to_array([ENT_IOB]) - for i, token in enumerate(doc): - if token.is_space: - # Sets 'O' tag (0 is None, so I is 1, O is 2) - ent_iobs[i] = 2 - doc.from_array([ENT_IOB], ent_iobs.reshape((len(doc), 1))) - return doc - - -def main(): - nlp = spacy.load("en_core_web_sm") - text = "This is some crazy test where I dont need an Apple Watch to make things bug" - doc = nlp(text) - print("Before", doc.ents) - nlp.add_pipe(fix_space_tags, name="fix-ner", before="ner") - doc = nlp(text) - print("After", doc.ents) - - -if __name__ == "__main__": - main() diff --git a/examples/pipeline/multi_processing.py b/examples/pipeline/multi_processing.py deleted file mode 100644 index e4aca7912..000000000 --- a/examples/pipeline/multi_processing.py +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 -"""Example of multi-processing with Joblib. Here, we're exporting -part-of-speech-tagged, true-cased, (very roughly) sentence-separated text, with -each "sentence" on a newline, and spaces between tokens. Data is loaded from -the IMDB movie reviews dataset and will be loaded automatically via Thinc's -built-in dataset loader. - -Compatible with: spaCy v2.0.0+ -Last tested with: v2.1.0 -Prerequisites: pip install joblib -""" -from __future__ import print_function, unicode_literals - -from pathlib import Path - -import ml_datasets -from joblib import Parallel, delayed -from functools import partial -import plac -import spacy -from spacy.util import minibatch - - -@plac.annotations( - output_dir=("Output directory", "positional", None, Path), - model=("Model name (needs tagger)", "positional", None, str), - n_jobs=("Number of workers", "option", "n", int), - batch_size=("Batch-size for each process", "option", "b", int), - limit=("Limit of entries from the dataset", "option", "l", int), -) -def main(output_dir, model="en_core_web_sm", n_jobs=4, batch_size=1000, limit=10000): - nlp = spacy.load(model) # load spaCy model - print("Loaded model '%s'" % model) - if not output_dir.exists(): - output_dir.mkdir() - # load and pre-process the IMBD dataset - print("Loading IMDB data...") - data, _ = ml_datasets.imdb() - texts, _ = zip(*data[-limit:]) - print("Processing texts...") - partitions = minibatch(texts, size=batch_size) - executor = Parallel(n_jobs=n_jobs, backend="multiprocessing", prefer="processes") - do = delayed(partial(transform_texts, nlp)) - tasks = (do(i, batch, output_dir) for i, batch in enumerate(partitions)) - executor(tasks) - - -def transform_texts(nlp, batch_id, texts, output_dir): - print(nlp.pipe_names) - out_path = Path(output_dir) / ("%d.txt" % batch_id) - if out_path.exists(): # return None in case same batch is called again - return None - print("Processing batch", batch_id) - with out_path.open("w", encoding="utf8") as f: - for doc in nlp.pipe(texts): - f.write(" ".join(represent_word(w) for w in doc if not w.is_space)) - f.write("\n") - print("Saved {} texts to {}.txt".format(len(texts), batch_id)) - - -def represent_word(word): - text = word.text - # True-case, i.e. try to normalize sentence-initial capitals. - # Only do this if the lower-cased form is more probable. 
- if ( - text.istitle() - and is_sent_begin(word) - and word.prob < word.doc.vocab[text.lower()].prob - ): - text = text.lower() - return text + "|" + word.tag_ - - -def is_sent_begin(word): - if word.i == 0: - return True - elif word.i >= 2 and word.nbor(-1).text in (".", "!", "?", "..."): - return True - else: - return False - - -if __name__ == "__main__": - plac.call(main) diff --git a/examples/streamlit_spacy.py b/examples/streamlit_spacy.py deleted file mode 100644 index 2b527b3df..000000000 --- a/examples/streamlit_spacy.py +++ /dev/null @@ -1,165 +0,0 @@ -# coding: utf-8 -""" -Example of a Streamlit app for an interactive spaCy model visualizer. You can -either download the script, or point `streamlit run` to the raw URL of this -file. For more details, see https://streamlit.io. - -Installation: -pip install streamlit -python -m spacy download en_core_web_sm -python -m spacy download en_core_web_md -python -m spacy download de_core_news_sm - -Usage: -streamlit run streamlit_spacy.py -""" -from __future__ import unicode_literals - -import base64 - -import streamlit as st -import spacy -from spacy import displacy -import pandas as pd - - -SPACY_MODEL_NAMES = ["en_core_web_sm", "en_core_web_md", "de_core_news_sm"] -DEFAULT_TEXT = "Mark Zuckerberg is the CEO of Facebook." -HTML_WRAPPER = """
<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>
""" - - -@st.cache(allow_output_mutation=True) -def load_model(name): - return spacy.load(name) - - -@st.cache(allow_output_mutation=True) -def process_text(model_name, text): - nlp = load_model(model_name) - return nlp(text) - - -st.sidebar.title("Interactive spaCy visualizer") -st.sidebar.markdown( - """ -Process text with [spaCy](https://spacy.io) models and visualize named entities, -dependencies and more. Uses spaCy's built-in -[displaCy](http://spacy.io/usage/visualizers) visualizer under the hood. -""" -) - -spacy_model = st.sidebar.selectbox("Model name", SPACY_MODEL_NAMES) -model_load_state = st.info(f"Loading model '{spacy_model}'...") -nlp = load_model(spacy_model) -model_load_state.empty() - -text = st.text_area("Text to analyze", DEFAULT_TEXT) -doc = process_text(spacy_model, text) - - -def render_svg(svg): - """Renders the given svg string.""" - b64 = base64.b64encode(svg.encode('utf-8')).decode("utf-8") - html = r'' % b64 - st.write(html, unsafe_allow_html=True) - - -if "parser" in nlp.pipe_names: - st.header("Dependency Parse & Part-of-speech tags") - st.sidebar.header("Dependency Parse") - split_sents = st.sidebar.checkbox("Split sentences", value=True) - collapse_punct = st.sidebar.checkbox("Collapse punctuation", value=True) - collapse_phrases = st.sidebar.checkbox("Collapse phrases") - compact = st.sidebar.checkbox("Compact mode") - options = { - "collapse_punct": collapse_punct, - "collapse_phrases": collapse_phrases, - "compact": compact, - } - docs = [span.as_doc() for span in doc.sents] if split_sents else [doc] - for sent in docs: - html = displacy.render(sent, options=options, style="dep") - # Double newlines seem to mess with the rendering - html = html.replace("\n\n", "\n") - if split_sents and len(docs) > 1: - st.markdown(f"> {sent.text}") - render_svg(html) - # this didn't show the dep arc labels properly, cf #5089 - # st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True) - -if "ner" in nlp.pipe_names: - st.header("Named Entities") - st.sidebar.header("Named Entities") - label_set = nlp.get_pipe("ner").labels - labels = st.sidebar.multiselect( - "Entity labels", options=label_set, default=list(label_set) - ) - html = displacy.render(doc, style="ent", options={"ents": labels}) - # Newlines seem to mess with the rendering - html = html.replace("\n", " ") - st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True) - attrs = ["text", "label_", "start", "end", "start_char", "end_char"] - if "entity_linker" in nlp.pipe_names: - attrs.append("kb_id_") - data = [ - [str(getattr(ent, attr)) for attr in attrs] - for ent in doc.ents - if ent.label_ in labels - ] - df = pd.DataFrame(data, columns=attrs) - st.dataframe(df) - - -if "textcat" in nlp.pipe_names: - st.header("Text Classification") - st.markdown(f"> {text}") - df = pd.DataFrame(doc.cats.items(), columns=("Label", "Score")) - st.dataframe(df) - - -vector_size = nlp.meta.get("vectors", {}).get("width", 0) -if vector_size: - st.header("Vectors & Similarity") - st.code(nlp.meta["vectors"]) - text1 = st.text_input("Text or word 1", "apple") - text2 = st.text_input("Text or word 2", "orange") - doc1 = process_text(spacy_model, text1) - doc2 = process_text(spacy_model, text2) - similarity = doc1.similarity(doc2) - if similarity > 0.5: - st.success(similarity) - else: - st.error(similarity) - -st.header("Token attributes") - -if st.button("Show token attributes"): - attrs = [ - "idx", - "text", - "lemma_", - "pos_", - "tag_", - "dep_", - "head", - "ent_type_", - "ent_iob_", - "shape_", - "is_alpha", - 
"is_ascii", - "is_digit", - "is_punct", - "like_num", - ] - data = [[str(getattr(token, attr)) for attr in attrs] for token in doc] - df = pd.DataFrame(data, columns=attrs) - st.dataframe(df) - - -st.header("JSON Doc") -if st.button("Show JSON Doc"): - st.json(doc.to_json()) - -st.header("JSON model meta") -if st.button("Show JSON model meta"): - st.json(nlp.meta) diff --git a/examples/training/conllu-config.json b/examples/training/conllu-config.json deleted file mode 100644 index 9a11dd96b..000000000 --- a/examples/training/conllu-config.json +++ /dev/null @@ -1 +0,0 @@ -{"nr_epoch": 3, "batch_size": 24, "dropout": 0.001, "vectors": 0, "multitask_tag": 0, "multitask_sent": 0} diff --git a/examples/training/conllu.py b/examples/training/conllu.py deleted file mode 100644 index a398b0ae0..000000000 --- a/examples/training/conllu.py +++ /dev/null @@ -1,404 +0,0 @@ -"""Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes -.conllu format for development data, allowing the official scorer to be used. -""" -from __future__ import unicode_literals -import plac -import attr -from pathlib import Path -import re -import json -import tqdm - -import spacy -import spacy.util -from spacy.tokens import Token, Doc -from spacy.gold import Example -from spacy.pipeline._parser_internals.nonproj import projectivize -from collections import defaultdict -from spacy.matcher import Matcher - -import itertools -import random -import numpy.random - -from bin.ud import conll17_ud_eval - -import spacy.lang.zh -import spacy.lang.ja - -spacy.lang.zh.Chinese.Defaults.use_jieba = False -spacy.lang.ja.Japanese.Defaults.use_janome = False - -random.seed(0) -numpy.random.seed(0) - - -################ -# Data reading # -################ - -space_re = re.compile("\s+") - - -def split_text(text): - return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")] - - -def read_data( - nlp, - conllu_file, - text_file, - raw_text=True, - oracle_segments=False, - max_doc_length=None, - limit=None, -): - """Read the CONLLU format into Example objects. If raw_text=True, - include Doc objects created using nlp.make_doc and then aligned against - the gold-standard sequences. If oracle_segments=True, include Doc objects - created from the gold-standard segments. At least one must be True.""" - if not raw_text and not oracle_segments: - raise ValueError("At least one of raw_text or oracle_segments must be True") - paragraphs = split_text(text_file.read()) - conllu = read_conllu(conllu_file) - # sd is spacy doc; cd is conllu doc - # cs is conllu sent, ct is conllu token - docs = [] - golds = [] - for doc_id, (text, cd) in enumerate(zip(paragraphs, conllu)): - sent_annots = [] - for cs in cd: - sent = defaultdict(list) - for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs: - if "." 
in id_: - continue - if "-" in id_: - continue - id_ = int(id_) - 1 - head = int(head) - 1 if head != "0" else id_ - sent["words"].append(word) - sent["tags"].append(tag) - sent["heads"].append(head) - sent["deps"].append("ROOT" if dep == "root" else dep) - sent["spaces"].append(space_after == "_") - sent["entities"] = ["-"] * len(sent["words"]) - sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"]) - if oracle_segments: - docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"])) - golds.append(sent) - - sent_annots.append(sent) - if raw_text and max_doc_length and len(sent_annots) >= max_doc_length: - doc, gold = _make_gold(nlp, None, sent_annots) - sent_annots = [] - docs.append(doc) - golds.append(gold) - if limit and len(docs) >= limit: - return golds_to_gold_data(docs, golds) - - if raw_text and sent_annots: - doc, gold = _make_gold(nlp, None, sent_annots) - docs.append(doc) - golds.append(gold) - if limit and len(docs) >= limit: - return golds_to_gold_data(docs, golds) - return golds_to_gold_data(docs, golds) - - -def read_conllu(file_): - docs = [] - sent = [] - doc = [] - for line in file_: - if line.startswith("# newdoc"): - if doc: - docs.append(doc) - doc = [] - elif line.startswith("#"): - continue - elif not line.strip(): - if sent: - doc.append(sent) - sent = [] - else: - sent.append(list(line.strip().split("\t"))) - if len(sent[-1]) != 10: - print(repr(line)) - raise ValueError - if sent: - doc.append(sent) - if doc: - docs.append(doc) - return docs - - -def _make_gold(nlp, text, sent_annots): - # Flatten the conll annotations, and adjust the head indices - gold = defaultdict(list) - for sent in sent_annots: - gold["heads"].extend(len(gold["words"]) + head for head in sent["heads"]) - for field in ["words", "tags", "deps", "entities", "spaces"]: - gold[field].extend(sent[field]) - # Construct text if necessary - assert len(gold["words"]) == len(gold["spaces"]) - if text is None: - text = "".join( - word + " " * space for word, space in zip(gold["words"], gold["spaces"]) - ) - doc = nlp.make_doc(text) - gold.pop("spaces") - return doc, gold - - -############################# -# Data transforms for spaCy # -############################# - - -def golds_to_gold_data(docs, golds): - """Get out the training data format used by begin_training.""" - data = [] - for doc, gold in zip(docs, golds): - example = Example.from_dict(doc, gold) - data.append(example) - return data - - -############## -# Evaluation # -############## - - -def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None): - with text_loc.open("r", encoding="utf8") as text_file: - texts = split_text(text_file.read()) - docs = list(nlp.pipe(texts)) - with sys_loc.open("w", encoding="utf8") as out_file: - write_conllu(docs, out_file) - with gold_loc.open("r", encoding="utf8") as gold_file: - gold_ud = conll17_ud_eval.load_conllu(gold_file) - with sys_loc.open("r", encoding="utf8") as sys_file: - sys_ud = conll17_ud_eval.load_conllu(sys_file) - scores = conll17_ud_eval.evaluate(gold_ud, sys_ud) - return scores - - -def write_conllu(docs, file_): - merger = Matcher(docs[0].vocab) - merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}]) - for i, doc in enumerate(docs): - matches = merger(doc) - spans = [doc[start : end + 1] for _, start, end in matches] - offsets = [(span.start_char, span.end_char) for span in spans] - for start_char, end_char in offsets: - doc.merge(start_char, end_char) - file_.write("# newdoc id = {i}\n".format(i=i)) - for j, sent in enumerate(doc.sents): - 
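            # Each sentence is written as one CoNLL-U block: a "# sent_id"
            # comment, a "# text" comment, one tab-separated line per token,
            # then a blank line.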
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j)) - file_.write("# text = {text}\n".format(text=sent.text)) - for k, token in enumerate(sent): - file_.write(token._.get_conllu_lines(k) + "\n") - file_.write("\n") - - -def print_progress(itn, losses, ud_scores): - fields = { - "dep_loss": losses.get("parser", 0.0), - "tag_loss": losses.get("tagger", 0.0), - "words": ud_scores["Words"].f1 * 100, - "sents": ud_scores["Sentences"].f1 * 100, - "tags": ud_scores["XPOS"].f1 * 100, - "uas": ud_scores["UAS"].f1 * 100, - "las": ud_scores["LAS"].f1 * 100, - } - header = ["Epoch", "Loss", "LAS", "UAS", "TAG", "SENT", "WORD"] - if itn == 0: - print("\t".join(header)) - tpl = "\t".join( - ( - "{:d}", - "{dep_loss:.1f}", - "{las:.1f}", - "{uas:.1f}", - "{tags:.1f}", - "{sents:.1f}", - "{words:.1f}", - ) - ) - print(tpl.format(itn, **fields)) - - -# def get_sent_conllu(sent, sent_id): -# lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)] - - -def get_token_conllu(token, i): - if token._.begins_fused: - n = 1 - while token.nbor(n)._.inside_fused: - n += 1 - id_ = "%d-%d" % (i, i + n) - lines = [id_, token.text, "_", "_", "_", "_", "_", "_", "_", "_"] - else: - lines = [] - if token.head.i == token.i: - head = 0 - else: - head = i + (token.head.i - token.i) + 1 - fields = [ - str(i + 1), - token.text, - token.lemma_, - token.pos_, - token.tag_, - "_", - str(head), - token.dep_.lower(), - "_", - "_", - ] - lines.append("\t".join(fields)) - return "\n".join(lines) - - -################## -# Initialization # -################## - - -def load_nlp(corpus, config): - lang = corpus.split("_")[0] - nlp = spacy.blank(lang) - if config.vectors: - nlp.vocab.from_disk(config.vectors / "vocab") - return nlp - - -def initialize_pipeline(nlp, examples, config): - nlp.add_pipe(nlp.create_pipe("parser")) - if config.multitask_tag: - nlp.parser.add_multitask_objective("tag") - if config.multitask_sent: - nlp.parser.add_multitask_objective("sent_start") - nlp.parser.moves.add_action(2, "subtok") - nlp.add_pipe(nlp.create_pipe("tagger")) - for eg in examples: - for tag in eg.get_aligned("TAG", as_string=True): - if tag is not None: - nlp.tagger.add_label(tag) - # Replace labels that didn't make the frequency cutoff - actions = set(nlp.parser.labels) - label_set = set([act.split("-")[1] for act in actions if "-" in act]) - for eg in examples: - gold = eg.gold - for i, label in enumerate(gold.labels): - if label is not None and label not in label_set: - gold.labels[i] = label.split("||")[0] - return nlp.begin_training(lambda: examples) - - -######################## -# Command line helpers # -######################## - - -@attr.s -class Config(object): - vectors = attr.ib(default=None) - max_doc_length = attr.ib(default=10) - multitask_tag = attr.ib(default=True) - multitask_sent = attr.ib(default=True) - nr_epoch = attr.ib(default=30) - batch_size = attr.ib(default=1000) - dropout = attr.ib(default=0.2) - - @classmethod - def load(cls, loc): - with Path(loc).open("r", encoding="utf8") as file_: - cfg = json.load(file_) - return cls(**cfg) - - -class Dataset(object): - def __init__(self, path, section): - self.path = path - self.section = section - self.conllu = None - self.text = None - for file_path in self.path.iterdir(): - name = file_path.parts[-1] - if section in name and name.endswith("conllu"): - self.conllu = file_path - elif section in name and name.endswith("txt"): - self.text = file_path - if self.conllu is None: - msg = "Could not find .txt file in {path} for {section}" - raise 
IOError(msg.format(section=section, path=path)) - if self.text is None: - msg = "Could not find .txt file in {path} for {section}" - self.lang = self.conllu.parts[-1].split("-")[0].split("_")[0] - - -class TreebankPaths(object): - def __init__(self, ud_path, treebank, **cfg): - self.train = Dataset(ud_path / treebank, "train") - self.dev = Dataset(ud_path / treebank, "dev") - self.lang = self.train.lang - - -@plac.annotations( - ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path), - parses_dir=("Directory to write the development parses", "positional", None, Path), - config=("Path to json formatted config file", "positional", None, Config.load), - corpus=( - "UD corpus to train and evaluate on, e.g. UD_Spanish-AnCora", - "positional", - None, - str, - ), - limit=("Size limit", "option", "n", int), -) -def main(ud_dir, parses_dir, config, corpus, limit=0): - Token.set_extension("get_conllu_lines", method=get_token_conllu) - Token.set_extension("begins_fused", default=False) - Token.set_extension("inside_fused", default=False) - - Token.set_extension("get_conllu_lines", method=get_token_conllu) - Token.set_extension("begins_fused", default=False) - Token.set_extension("inside_fused", default=False) - - paths = TreebankPaths(ud_dir, corpus) - if not (parses_dir / corpus).exists(): - (parses_dir / corpus).mkdir() - print("Train and evaluate", corpus, "using lang", paths.lang) - nlp = load_nlp(paths.lang, config) - - examples = read_data( - nlp, - paths.train.conllu.open(encoding="utf8"), - paths.train.text.open(encoding="utf8"), - max_doc_length=config.max_doc_length, - limit=limit, - ) - - optimizer = initialize_pipeline(nlp, examples, config) - - for i in range(config.nr_epoch): - batches = spacy.minibatch_by_words(examples, size=config.batch_size) - losses = {} - n_train_words = sum(len(eg.reference.doc) for eg in examples) - with tqdm.tqdm(total=n_train_words, leave=False) as pbar: - for batch in batches: - pbar.update(sum(len(eg.reference.doc) for eg in batch)) - nlp.update( - examples=batch, sgd=optimizer, drop=config.dropout, losses=losses, - ) - - out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i) - with nlp.use_params(optimizer.averages): - scores = evaluate(nlp, paths.dev.text, paths.dev.conllu, out_path) - print_progress(i, losses, scores) - - -if __name__ == "__main__": - plac.call(main) diff --git a/examples/training/create_kb.py b/examples/training/create_kb.py deleted file mode 100644 index a455c8d7e..000000000 --- a/examples/training/create_kb.py +++ /dev/null @@ -1,114 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 - -"""Example of defining a knowledge base in spaCy, -which is needed to implement entity linking functionality. 
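A knowledge base maps entity IDs to corpus frequencies and description vectors, and aliases to candidate entities with prior probabilities. The script below loads its two entities in bulk with kb.set_entities; for comparison, a minimal sketch of the per-entity API in the same spaCy v2.2 style, with illustrative IDs, counts and a 3-dimensional vector, looks like this:

from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab

vocab = Vocab()
kb = KnowledgeBase(vocab, entity_vector_length=3)
# each entity gets an ID, a corpus frequency and a description vector
kb.add_entity(entity="Q2146908", freq=342, entity_vector=[1.0, 0.0, 0.0])
kb.add_entity(entity="Q7381115", freq=17, entity_vector=[0.0, 1.0, 0.0])
# an alias maps a surface form to candidate entities with prior probabilities
kb.add_alias(alias="Russ Cochran", entities=["Q2146908", "Q7381115"],
             probabilities=[0.24, 0.7])
print(kb.get_entity_strings(), kb.get_alias_strings())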
- -For more details, see the documentation: -* Knowledge base: https://spacy.io/api/kb -* Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking - -Compatible with: spaCy v2.2.4 -Last tested with: v2.2.4 -""" -from __future__ import unicode_literals, print_function - -import plac -from pathlib import Path - -from spacy.vocab import Vocab -import spacy -from spacy.kb import KnowledgeBase - - -# Q2146908 (Russ Cochran): American golfer -# Q7381115 (Russ Cochran): publisher -ENTITIES = {"Q2146908": ("American golfer", 342), "Q7381115": ("publisher", 17)} - - -@plac.annotations( - model=("Model name, should have pretrained word embeddings", "positional", None, str), - output_dir=("Optional output directory", "option", "o", Path), -) -def main(model, output_dir=None): - """Load the model and create the KB with pre-defined entity encodings. - If an output_dir is provided, the KB will be stored there in a file 'kb'. - The updated vocab will also be written to a directory in the output_dir.""" - - nlp = spacy.load(model) # load existing spaCy model - print("Loaded model '%s'" % model) - - # check the length of the nlp vectors - if "vectors" not in nlp.meta or not nlp.vocab.vectors.size: - raise ValueError( - "The `nlp` object should have access to pretrained word vectors, " - " cf. https://spacy.io/usage/models#languages." - ) - - # You can change the dimension of vectors in your KB by using an encoder that changes the dimensionality. - # For simplicity, we'll just use the original vector dimension here instead. - vectors_dim = nlp.vocab.vectors.shape[1] - kb = KnowledgeBase(nlp.vocab, entity_vector_length=vectors_dim) - - # set up the data - entity_ids = [] - descr_embeddings = [] - freqs = [] - for key, value in ENTITIES.items(): - desc, freq = value - entity_ids.append(key) - descr_embeddings.append(nlp(desc).vector) - freqs.append(freq) - - # set the entities, can also be done by calling `kb.add_entity` for each entity - kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=descr_embeddings) - - # adding aliases, the entities need to be defined in the KB beforehand - kb.add_alias( - alias="Russ Cochran", - entities=["Q2146908", "Q7381115"], - probabilities=[0.24, 0.7], # the sum of these probabilities should not exceed 1 - ) - - # test the trained model - print() - _print_kb(kb) - - # save model to output directory - if output_dir is not None: - output_dir = Path(output_dir) - if not output_dir.exists(): - output_dir.mkdir() - kb_path = str(output_dir / "kb") - kb.to_disk(kb_path) - print() - print("Saved KB to", kb_path) - - vocab_path = output_dir / "vocab" - kb.vocab.to_disk(vocab_path) - print("Saved vocab to", vocab_path) - - print() - - # test the saved model - # always reload a knowledge base with the same vocab instance! 
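    # The KB stores entity and alias names as hashes resolved through the
    # vocab's StringStore, which is why the vocab saved above has to be
    # loaded together with the KB.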
- print("Loading vocab from", vocab_path) - print("Loading KB from", kb_path) - vocab2 = Vocab().from_disk(vocab_path) - kb2 = KnowledgeBase(vocab2, entity_vector_length=1) - kb2.from_disk(kb_path) - print() - _print_kb(kb2) - - -def _print_kb(kb): - print(kb.get_size_entities(), "kb entities:", kb.get_entity_strings()) - print(kb.get_size_aliases(), "kb aliases:", kb.get_alias_strings()) - - -if __name__ == "__main__": - plac.call(main) - - # Expected output: - # 2 kb entities: ['Q2146908', 'Q7381115'] - # 1 kb aliases: ['Russ Cochran'] diff --git a/examples/training/ner_multitask_objective.py b/examples/training/ner_multitask_objective.py deleted file mode 100644 index baa6d7f06..000000000 --- a/examples/training/ner_multitask_objective.py +++ /dev/null @@ -1,88 +0,0 @@ -"""This example shows how to add a multi-task objective that is trained -alongside the entity recognizer. This is an alternative to adding features -to the model. - -The multi-task idea is to train an auxiliary model to predict some attribute, -with weights shared between the auxiliary model and the main model. In this -example, we're predicting the position of the word in the document. - -The model that predicts the position of the word encourages the convolutional -layers to include the position information in their representation. The -information is then available to the main model, as a feature. - -The overall idea is that we might know something about what sort of features -we'd like the CNN to extract. The multi-task objectives can encourage the -extraction of this type of feature. The multi-task objective is only used -during training. We discard the auxiliary model before run-time. - -The specific example here is not necessarily a good idea --- but it shows -how an arbitrary objective function for some word can be used. - -Developed and tested for spaCy 2.0.6. Updated for v2.2.2 -""" -import random -import plac -import spacy -import os.path - -from spacy.gold.example import Example -from spacy.tokens import Doc -from spacy.gold import read_json_file - -random.seed(0) - -PWD = os.path.dirname(__file__) - -TRAIN_DATA = list(read_json_file(os.path.join(PWD, "training-data.json"))) - - -def get_position_label(i, token_annotation): - """Return labels indicating the position of the word in the document. 
- """ - if len(token_annotation.words) < 20: - return "short-doc" - elif i == 0: - return "first-word" - elif i < 10: - return "early-word" - elif i < 20: - return "mid-word" - elif i == len(token_annotation.words) - 1: - return "last-word" - else: - return "late-word" - - -def main(n_iter=10): - nlp = spacy.blank("en") - ner = nlp.create_pipe("ner") - ner.add_multitask_objective(get_position_label) - nlp.add_pipe(ner) - print(nlp.pipeline) - - print("Create data", len(TRAIN_DATA)) - optimizer = nlp.begin_training() - for itn in range(n_iter): - random.shuffle(TRAIN_DATA) - losses = {} - for example_dict in TRAIN_DATA: - doc = Doc(nlp.vocab, words=example_dict["words"]) - example = Example.from_dict(doc, example_dict) - nlp.update( - examples=[example], # 1 example - drop=0.2, # dropout - make it harder to memorise data - sgd=optimizer, # callable to update weights - losses=losses, - ) - print(losses.get("nn_labeller", 0.0), losses["ner"]) - - # test the trained model - for example_dict in TRAIN_DATA: - if "text" in example_dict: - doc = nlp(example_dict["text"]) - print("Entities", [(ent.text, ent.label_) for ent in doc.ents]) - print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc]) - - -if __name__ == "__main__": - plac.call(main) diff --git a/examples/training/rehearsal.py b/examples/training/rehearsal.py deleted file mode 100644 index a7eb120c9..000000000 --- a/examples/training/rehearsal.py +++ /dev/null @@ -1,96 +0,0 @@ -"""Prevent catastrophic forgetting with rehearsal updates.""" -import plac -import random -import warnings -import srsly -import spacy -from spacy.gold import Example -from spacy.util import minibatch, compounding - -# TODO: further fix & test this script for v.3 ? (read_gold_data is never called) - -LABEL = "ANIMAL" -TRAIN_DATA = [ - ( - "Horses are too tall and they pretend to care about your feelings", - {"entities": [(0, 6, "ANIMAL")]}, - ), - ("Do they bite?", {"entities": []}), - ( - "horses are too tall and they pretend to care about your feelings", - {"entities": [(0, 6, "ANIMAL")]}, - ), - ("horses pretend to care about your feelings", {"entities": [(0, 6, "ANIMAL")]}), - ( - "they pretend to care about your feelings, those horses", - {"entities": [(48, 54, "ANIMAL")]}, - ), - ("horses?", {"entities": [(0, 6, "ANIMAL")]}), -] - - -def read_raw_data(nlp, jsonl_loc): - for json_obj in srsly.read_jsonl(jsonl_loc): - if json_obj["text"].strip(): - doc = nlp.make_doc(json_obj["text"]) - yield Example.from_dict(doc, {}) - - -def read_gold_data(nlp, gold_loc): - examples = [] - for json_obj in srsly.read_jsonl(gold_loc): - doc = nlp.make_doc(json_obj["text"]) - ents = [(ent["start"], ent["end"], ent["label"]) for ent in json_obj["spans"]] - example = Example.from_dict(doc, {"entities": ents}) - examples.append(example) - return examples - - -def main(model_name, unlabelled_loc): - n_iter = 10 - dropout = 0.2 - batch_size = 4 - nlp = spacy.load(model_name) - nlp.get_pipe("ner").add_label(LABEL) - raw_examples = list(read_raw_data(nlp, unlabelled_loc)) - optimizer = nlp.resume_training() - # Avoid use of Adam when resuming training. I don't understand this well - # yet, but I'm getting weird results from Adam. Try commenting out the - # nlp.update(), and using Adam -- you'll find the models drift apart. - # I guess Adam is losing precision, introducing gradient noise? 
- optimizer.learn_rate = 0.1 - optimizer.b1 = 0.0 - optimizer.b2 = 0.0 - sizes = compounding(1.0, 4.0, 1.001) - - train_examples = [] - for text, annotations in TRAIN_DATA: - train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) - - with nlp.select_pipes(enable="ner") and warnings.catch_warnings(): - # show warnings for misaligned entity spans once - warnings.filterwarnings("once", category=UserWarning, module="spacy") - - for itn in range(n_iter): - random.shuffle(train_examples) - random.shuffle(raw_examples) - losses = {} - r_losses = {} - # batch up the examples using spaCy's minibatch - raw_batches = minibatch(raw_examples, size=4) - for batch in minibatch(train_examples, size=sizes): - nlp.update(batch, sgd=optimizer, drop=dropout, losses=losses) - raw_batch = list(next(raw_batches)) - nlp.rehearse(raw_batch, sgd=optimizer, losses=r_losses) - print("Losses", losses) - print("R. Losses", r_losses) - print(nlp.get_pipe("ner").model.unseen_classes) - test_text = "Do you like horses?" - doc = nlp(test_text) - print("Entities in '%s'" % test_text) - for ent in doc.ents: - print(ent.label_, ent.text) - - -if __name__ == "__main__": - plac.call(main) diff --git a/examples/training/train_entity_linker.py b/examples/training/train_entity_linker.py deleted file mode 100644 index d2bd61e5b..000000000 --- a/examples/training/train_entity_linker.py +++ /dev/null @@ -1,177 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 - -"""Example of training spaCy's entity linker, starting off with a predefined -knowledge base and corresponding vocab, and a blank English model. - -For more details, see the documentation: -* Training: https://spacy.io/usage/training -* Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking - -Compatible with: spaCy v2.2.4 -Last tested with: v2.2.4 -""" -from __future__ import unicode_literals, print_function - -import plac -import random -from pathlib import Path -import spacy - -from spacy.gold import Example -from spacy.pipeline import EntityRuler -from spacy.util import minibatch, compounding - - -def sample_train_data(): - train_data = [] - - # Q2146908 (Russ Cochran): American golfer - # Q7381115 (Russ Cochran): publisher - - text_1 = "Russ Cochran his reprints include EC Comics." - dict_1 = {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}} - train_data.append((text_1, {"links": dict_1})) - - text_2 = "Russ Cochran has been publishing comic art." - dict_2 = {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}} - train_data.append((text_2, {"links": dict_2})) - - text_3 = "Russ Cochran captured his first major title with his son as caddie." - dict_3 = {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}} - train_data.append((text_3, {"links": dict_3})) - - text_4 = "Russ Cochran was a member of University of Kentucky's golf team." - dict_4 = {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}} - train_data.append((text_4, {"links": dict_4})) - - return train_data - - -# training data -TRAIN_DATA = sample_train_data() - - -@plac.annotations( - kb_path=("Path to the knowledge base", "positional", None, Path), - vocab_path=("Path to the vocab for the kb", "positional", None, Path), - output_dir=("Optional output directory", "option", "o", Path), - n_iter=("Number of training iterations", "option", "n", int), -) -def main(kb_path, vocab_path, output_dir=None, n_iter=50): - """Create a blank model with the specified vocab, set up the pipeline and train the entity linker. 
- The `vocab` should be the one used during creation of the KB.""" - # create blank English model with correct vocab - nlp = spacy.blank("en") - nlp.vocab.from_disk(vocab_path) - nlp.vocab.vectors.name = "spacy_pretrained_vectors" - print("Created blank 'en' model with vocab from '%s'" % vocab_path) - - # Add a sentencizer component. Alternatively, add a dependency parser for higher accuracy. - nlp.add_pipe(nlp.create_pipe("sentencizer")) - - # Add a custom component to recognize "Russ Cochran" as an entity for the example training data. - # Note that in a realistic application, an actual NER algorithm should be used instead. - ruler = EntityRuler(nlp) - patterns = [ - {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]} - ] - ruler.add_patterns(patterns) - nlp.add_pipe(ruler) - - # Create the Entity Linker component and add it to the pipeline. - if "entity_linker" not in nlp.pipe_names: - print("Loading Knowledge Base from '%s'" % kb_path) - cfg = { - "kb_loader": { - "@assets": "spacy.KBFromFile.v1", - "vocab_path": vocab_path, - "kb_path": kb_path, - }, - # use only the predicted EL score and not the prior probability (for demo purposes) - "incl_prior": False, - } - entity_linker = nlp.create_pipe("entity_linker", cfg) - nlp.add_pipe(entity_linker, last=True) - - # Convert the texts to docs to make sure we have doc.ents set for the training examples. - # Also ensure that the annotated examples correspond to known identifiers in the knowledge base. - kb_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings() - train_examples = [] - for text, annotation in TRAIN_DATA: - with nlp.select_pipes(disable="entity_linker"): - doc = nlp(text) - annotation_clean = annotation - for offset, kb_id_dict in annotation["links"].items(): - new_dict = {} - for kb_id, value in kb_id_dict.items(): - if kb_id in kb_ids: - new_dict[kb_id] = value - else: - print( - "Removed", kb_id, "from training because it is not in the KB." 
- ) - annotation_clean["links"][offset] = new_dict - train_examples.append(Example.from_dict(doc, annotation_clean)) - - with nlp.select_pipes(enable="entity_linker"): # only train entity linker - # reset and initialize the weights randomly - optimizer = nlp.begin_training() - - for itn in range(n_iter): - random.shuffle(train_examples) - losses = {} - # batch up the examples using spaCy's minibatch - batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001)) - for batch in batches: - nlp.update( - batch, - drop=0.2, # dropout - make it harder to memorise data - losses=losses, - sgd=optimizer, - ) - print(itn, "Losses", losses) - - # test the trained model - _apply_model(nlp) - - # save model to output directory - if output_dir is not None: - output_dir = Path(output_dir) - if not output_dir.exists(): - output_dir.mkdir() - nlp.to_disk(output_dir) - print() - print("Saved model to", output_dir) - - # test the saved model - print("Loading from", output_dir) - nlp2 = spacy.load(output_dir) - _apply_model(nlp2) - - -def _apply_model(nlp): - for text, annotation in TRAIN_DATA: - # apply the entity linker which will now make predictions for the 'Russ Cochran' entities - doc = nlp(text) - print() - print("Entities", [(ent.text, ent.label_, ent.kb_id_) for ent in doc.ents]) - print("Tokens", [(t.text, t.ent_type_, t.ent_kb_id_) for t in doc]) - - -if __name__ == "__main__": - plac.call(main) - - # Expected output (can be shuffled): - - # Entities[('Russ Cochran', 'PERSON', 'Q7381115')] - # Tokens[('Russ', 'PERSON', 'Q7381115'), ('Cochran', 'PERSON', 'Q7381115'), ("his", '', ''), ('reprints', '', ''), ('include', '', ''), ('The', '', ''), ('Complete', '', ''), ('EC', '', ''), ('Library', '', ''), ('.', '', '')] - - # Entities[('Russ Cochran', 'PERSON', 'Q7381115')] - # Tokens[('Russ', 'PERSON', 'Q7381115'), ('Cochran', 'PERSON', 'Q7381115'), ('has', '', ''), ('been', '', ''), ('publishing', '', ''), ('comic', '', ''), ('art', '', ''), ('.', '', '')] - - # Entities[('Russ Cochran', 'PERSON', 'Q2146908')] - # Tokens[('Russ', 'PERSON', 'Q2146908'), ('Cochran', 'PERSON', 'Q2146908'), ('captured', '', ''), ('his', '', ''), ('first', '', ''), ('major', '', ''), ('title', '', ''), ('with', '', ''), ('his', '', ''), ('son', '', ''), ('as', '', ''), ('caddie', '', ''), ('.', '', '')] - - # Entities[('Russ Cochran', 'PERSON', 'Q2146908')] - # Tokens[('Russ', 'PERSON', 'Q2146908'), ('Cochran', 'PERSON', 'Q2146908'), ('was', '', ''), ('a', '', ''), ('member', '', ''), ('of', '', ''), ('University', '', ''), ('of', '', ''), ('Kentucky', '', ''), ("'s", '', ''), ('golf', '', ''), ('team', '', ''), ('.', '', '')] diff --git a/examples/training/train_intent_parser.py b/examples/training/train_intent_parser.py deleted file mode 100644 index fffa140f4..000000000 --- a/examples/training/train_intent_parser.py +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 -"""Using the parser to recognise your own semantics - -spaCy's parser component can be trained to predict any type of tree -structure over your input text. You can also predict trees over whole documents -or chat logs, with connections between the sentence-roots used to annotate -discourse structure. In this example, we'll build a message parser for a common -"chat intent": finding local businesses. Our message semantics will have the -following types of relations: ROOT, PLACE, QUALITY, ATTRIBUTE, TIME, LOCATION. 
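Each training example lists, for every token, the index of its head plus a relation label. A quick, standalone way to sanity-check such an annotation (a small sketch using the first example defined further down) is:

words = "find a cafe with great wifi".split()
heads = [0, 2, 0, 5, 5, 2]  # index of each token's head; the root points at itself
deps = ["ROOT", "-", "PLACE", "-", "QUALITY", "ATTRIBUTE"]
for word, head, dep in zip(words, heads, deps):
    print(f"{word!r} --{dep}--> {words[head]!r}")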
- -"show me the best hotel in berlin" -('show', 'ROOT', 'show') -('best', 'QUALITY', 'hotel') --> hotel with QUALITY best -('hotel', 'PLACE', 'show') --> show PLACE hotel -('berlin', 'LOCATION', 'hotel') --> hotel with LOCATION berlin - -Compatible with: spaCy v2.0.0+ -""" -from __future__ import unicode_literals, print_function - -import plac -import random -from pathlib import Path -import spacy -from spacy.gold import Example -from spacy.util import minibatch, compounding - - -# training data: texts, heads and dependency labels -# for no relation, we simply chose an arbitrary dependency label, e.g. '-' -TRAIN_DATA = [ - ( - "find a cafe with great wifi", - { - "heads": [0, 2, 0, 5, 5, 2], # index of token head - "deps": ["ROOT", "-", "PLACE", "-", "QUALITY", "ATTRIBUTE"], - }, - ), - ( - "find a hotel near the beach", - { - "heads": [0, 2, 0, 5, 5, 2], - "deps": ["ROOT", "-", "PLACE", "QUALITY", "-", "ATTRIBUTE"], - }, - ), - ( - "find me the closest gym that's open late", - { - "heads": [0, 0, 4, 4, 0, 6, 4, 6, 6], - "deps": [ - "ROOT", - "-", - "-", - "QUALITY", - "PLACE", - "-", - "-", - "ATTRIBUTE", - "TIME", - ], - }, - ), - ( - "show me the cheapest store that sells flowers", - { - "heads": [0, 0, 4, 4, 0, 4, 4, 4], # attach "flowers" to store! - "deps": ["ROOT", "-", "-", "QUALITY", "PLACE", "-", "-", "PRODUCT"], - }, - ), - ( - "find a nice restaurant in london", - { - "heads": [0, 3, 3, 0, 3, 3], - "deps": ["ROOT", "-", "QUALITY", "PLACE", "-", "LOCATION"], - }, - ), - ( - "show me the coolest hostel in berlin", - { - "heads": [0, 0, 4, 4, 0, 4, 4], - "deps": ["ROOT", "-", "-", "QUALITY", "PLACE", "-", "LOCATION"], - }, - ), - ( - "find a good italian restaurant near work", - { - "heads": [0, 4, 4, 4, 0, 4, 5], - "deps": [ - "ROOT", - "-", - "QUALITY", - "ATTRIBUTE", - "PLACE", - "ATTRIBUTE", - "LOCATION", - ], - }, - ), -] - - -@plac.annotations( - model=("Model name. Defaults to blank 'en' model.", "option", "m", str), - output_dir=("Optional output directory", "option", "o", Path), - n_iter=("Number of training iterations", "option", "n", int), -) -def main(model=None, output_dir=None, n_iter=15): - """Load the model, set up the pipeline and train the parser.""" - if model is not None: - nlp = spacy.load(model) # load existing spaCy model - print("Loaded model '%s'" % model) - else: - nlp = spacy.blank("en") # create blank Language class - print("Created blank 'en' model") - - # We'll use the built-in dependency parser class, but we want to create a - # fresh instance – just in case. 
- if "parser" in nlp.pipe_names: - nlp.remove_pipe("parser") - parser = nlp.create_pipe("parser") - nlp.add_pipe(parser, first=True) - - train_examples = [] - for text, annotations in TRAIN_DATA: - train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) - for dep in annotations.get("deps", []): - parser.add_label(dep) - - with nlp.select_pipes(enable="parser"): # only train parser - optimizer = nlp.begin_training() - for itn in range(n_iter): - random.shuffle(train_examples) - losses = {} - # batch up the examples using spaCy's minibatch - batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001)) - for batch in batches: - nlp.update(batch, sgd=optimizer, losses=losses) - print("Losses", losses) - - # test the trained model - test_model(nlp) - - # save model to output directory - if output_dir is not None: - output_dir = Path(output_dir) - if not output_dir.exists(): - output_dir.mkdir() - nlp.to_disk(output_dir) - print("Saved model to", output_dir) - - # test the saved model - print("Loading from", output_dir) - nlp2 = spacy.load(output_dir) - test_model(nlp2) - - -def test_model(nlp): - texts = [ - "find a hotel with good wifi", - "find me the cheapest gym near work", - "show me the best hotel in berlin", - ] - docs = nlp.pipe(texts) - for doc in docs: - print(doc.text) - print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != "-"]) - - -if __name__ == "__main__": - plac.call(main) - - # Expected output: - # find a hotel with good wifi - # [ - # ('find', 'ROOT', 'find'), - # ('hotel', 'PLACE', 'find'), - # ('good', 'QUALITY', 'wifi'), - # ('wifi', 'ATTRIBUTE', 'hotel') - # ] - # find me the cheapest gym near work - # [ - # ('find', 'ROOT', 'find'), - # ('cheapest', 'QUALITY', 'gym'), - # ('gym', 'PLACE', 'find'), - # ('near', 'ATTRIBUTE', 'gym'), - # ('work', 'LOCATION', 'near') - # ] - # show me the best hotel in berlin - # [ - # ('show', 'ROOT', 'show'), - # ('best', 'QUALITY', 'hotel'), - # ('hotel', 'PLACE', 'show'), - # ('berlin', 'LOCATION', 'hotel') - # ] diff --git a/examples/training/train_morphologizer.py b/examples/training/train_morphologizer.py deleted file mode 100644 index 8c39a28a6..000000000 --- a/examples/training/train_morphologizer.py +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 -""" -A simple example for training a morphologizer. For more details, see -the documentation: -* Training: https://spacy.io/usage/training - -Compatible with: spaCy v3.0.0+ -Last tested with: v3.0.0 -""" -from __future__ import unicode_literals, print_function - -import plac -import random -from pathlib import Path -import spacy -from spacy.gold import Example -from spacy.util import minibatch, compounding -from spacy.morphology import Morphology - - -# Usually you'll read this in, of course. Data formats vary. Ensure your -# strings are unicode and that the number of tags assigned matches spaCy's -# tokenization. 
If not, you can always add a 'words' key to the annotations -# that specifies the gold-standard tokenization, e.g.: -# ("Eatblueham", {'words': ['Eat', 'blue', 'ham'], 'tags': ['V', 'J', 'N']}) -TRAIN_DATA = [ - ( - "I like green eggs", - { - "morphs": [ - "PronType=Prs|Person=1", - "VerbForm=Fin", - "Degree=Pos", - "Number=Plur", - ], - "pos": ["PRON", "VERB", "ADJ", "NOUN"], - }, - ), - ( - "Eat blue ham", - { - "morphs": ["VerbForm=Inf", "Degree=Pos", "Number=Sing"], - "pos": ["VERB", "ADJ", "NOUN"], - }, - ), - ( - "She was blue", - { - "morphs": ["PronType=Prs|Person=3", "VerbForm=Fin", "Degree=Pos"], - "pos": ["PRON", "VERB", "ADJ"], - }, - ), - ( - "He was blue today", - { - "morphs": ["PronType=Prs|Person=3", "VerbForm=Fin", "Degree=Pos", ""], - "pos": ["PRON", "VERB", "ADJ", "ADV"], - }, - ), -] - -# The POS tags are optional, set `with_pos_tags = False` to omit them for -# this example: -with_pos_tags = True - -if not with_pos_tags: - for i in range(len(TRAIN_DATA)): - del TRAIN_DATA[i][1]["pos"] - - -@plac.annotations( - lang=("ISO Code of language to use", "option", "l", str), - output_dir=("Optional output directory", "option", "o", Path), - n_iter=("Number of training iterations", "option", "n", int), -) -def main(lang="en", output_dir=None, n_iter=25): - """Create a new model, set up the pipeline and train the tagger. In order to - train the tagger with a custom tag map, we're creating a new Language - instance with a custom vocab. - """ - nlp = spacy.blank(lang) - # add the tagger to the pipeline - # nlp.create_pipe works for built-ins that are registered with spaCy - morphologizer = nlp.create_pipe("morphologizer") - nlp.add_pipe(morphologizer) - - # add labels and create the Example instances - train_examples = [] - for text, annotations in TRAIN_DATA: - train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) - morph_labels = annotations.get("morphs") - pos_labels = annotations.get("pos", [""] * len(annotations.get("morphs"))) - assert len(morph_labels) == len(pos_labels) - for morph, pos in zip(morph_labels, pos_labels): - morph_dict = Morphology.feats_to_dict(morph) - if pos: - morph_dict["POS"] = pos - morph = Morphology.dict_to_feats(morph_dict) - morphologizer.add_label(morph) - - optimizer = nlp.begin_training() - for i in range(n_iter): - random.shuffle(train_examples) - losses = {} - # batch up the examples using spaCy's minibatch - batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001)) - for batch in batches: - nlp.update(batch, sgd=optimizer, losses=losses) - print("Losses", losses) - - # test the trained model - test_text = "I like blue eggs" - doc = nlp(test_text) - print("Morphs", [(t.text, t.morph) for t in doc]) - - # save model to output directory - if output_dir is not None: - output_dir = Path(output_dir) - if not output_dir.exists(): - output_dir.mkdir() - nlp.to_disk(output_dir) - print("Saved model to", output_dir) - - # test the save model - print("Loading from", output_dir) - nlp2 = spacy.load(output_dir) - doc = nlp2(test_text) - print("Morphs", [(t.text, t.morph) for t in doc]) - - -if __name__ == "__main__": - plac.call(main) - -# Expected output: -# Morphs [('I', POS=PRON|Person=1|PronType=Prs), ('like', POS=VERB|VerbForm=Fin), ('blue', Degree=Pos|POS=ADJ), ('eggs', Number=Plur|POS=NOUN)] diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py deleted file mode 100644 index 26b283777..000000000 --- a/examples/training/train_ner.py +++ /dev/null @@ -1,118 +0,0 @@ -#!/usr/bin/env python -# 
coding: utf8 -"""Example of training spaCy's named entity recognizer, starting off with an -existing model or a blank model. - -For more details, see the documentation: -* Training: https://spacy.io/usage/training -* NER: https://spacy.io/usage/linguistic-features#named-entities - -Compatible with: spaCy v2.0.0+ -Last tested with: v2.2.4 -""" -from __future__ import unicode_literals, print_function - -import plac -import random -import warnings -from pathlib import Path -import spacy -from spacy.gold import Example -from spacy.util import minibatch, compounding - - -# training data -TRAIN_DATA = [ - ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}), - ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}), -] - - -@plac.annotations( - model=("Model name. Defaults to blank 'en' model.", "option", "m", str), - output_dir=("Optional output directory", "option", "o", Path), - n_iter=("Number of training iterations", "option", "n", int), -) -def main(model=None, output_dir=None, n_iter=100): - """Load the model, set up the pipeline and train the entity recognizer.""" - if model is not None: - nlp = spacy.load(model) # load existing spaCy model - print("Loaded model '%s'" % model) - else: - nlp = spacy.blank("en") # create blank Language class - print("Created blank 'en' model") - - # create the built-in pipeline components and add them to the pipeline - # nlp.create_pipe works for built-ins that are registered with spaCy - if "simple_ner" not in nlp.pipe_names: - ner = nlp.create_pipe("simple_ner") - nlp.add_pipe(ner, last=True) - # otherwise, get it so we can add labels - else: - ner = nlp.get_pipe("simple_ner") - - # add labels and create Example objects - train_examples = [] - for text, annotations in TRAIN_DATA: - train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) - for ent in annotations.get("entities"): - print("Add label", ent[2]) - ner.add_label(ent[2]) - - with nlp.select_pipes(enable="simple_ner") and warnings.catch_warnings(): - # show warnings for misaligned entity spans once - warnings.filterwarnings("once", category=UserWarning, module="spacy") - - # reset and initialize the weights randomly – but only if we're - # training a new model - if model is None: - nlp.begin_training() - print( - "Transitions", list(enumerate(nlp.get_pipe("simple_ner").get_tag_names())) - ) - for itn in range(n_iter): - random.shuffle(train_examples) - losses = {} - # batch up the examples using spaCy's minibatch - batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001)) - for batch in batches: - nlp.update( - batch, - drop=0.0, # dropout - make it harder to memorise data - losses=losses, - ) - print("Losses", losses) - - # test the trained model - for text, _ in TRAIN_DATA: - doc = nlp(text) - print("Entities", [(ent.text, ent.label_) for ent in doc.ents]) - print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc]) - - # save model to output directory - if output_dir is not None: - output_dir = Path(output_dir) - if not output_dir.exists(): - output_dir.mkdir() - nlp.to_disk(output_dir) - print("Saved model to", output_dir) - - # test the saved model - print("Loading from", output_dir) - nlp2 = spacy.load(output_dir) - for text, _ in TRAIN_DATA: - doc = nlp2(text) - print("Entities", [(ent.text, ent.label_) for ent in doc.ents]) - print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc]) - - -if __name__ == "__main__": - plac.call(main) - - # Expected output: - # Entities [('Shaka Khan', 'PERSON')] - # Tokens [('Who', '', 
2), ('is', '', 2), ('Shaka', 'PERSON', 3), - # ('Khan', 'PERSON', 1), ('?', '', 2)] - # Entities [('London', 'LOC'), ('Berlin', 'LOC')] - # Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3), - # ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)] diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py deleted file mode 100644 index c4edafac4..000000000 --- a/examples/training/train_new_entity_type.py +++ /dev/null @@ -1,143 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 -"""Example of training an additional entity type - -This script shows how to add a new entity type to an existing pretrained NER -model. To keep the example short and simple, only four sentences are provided -as examples. In practice, you'll need many more — a few hundred would be a -good start. You will also likely need to mix in examples of other entity -types, which might be obtained by running the entity recognizer over unlabelled -sentences, and adding their annotations to the training set. - -The actual training is performed by looping over the examples, and calling -`nlp.entity.update()`. The `update()` method steps through the words of the -input. At each word, it makes a prediction. It then consults the annotations -provided on the GoldParse instance, to see whether it was right. If it was -wrong, it adjusts its weights so that the correct action will score higher -next time. - -After training your model, you can save it to a directory. We recommend -wrapping models as Python packages, for ease of deployment. - -For more details, see the documentation: -* Training: https://spacy.io/usage/training -* NER: https://spacy.io/usage/linguistic-features#named-entities - -Compatible with: spaCy v2.1.0+ -Last tested with: v2.2.4 -""" -from __future__ import unicode_literals, print_function - -import plac -import random -import warnings -from pathlib import Path -import spacy -from spacy.util import minibatch, compounding - - -# new entity label -LABEL = "ANIMAL" - -# training data -# Note: If you're using an existing model, make sure to mix in examples of -# other entity types that spaCy correctly recognized before. Otherwise, your -# model might learn the new type, but "forget" what it previously knew. -# https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting -TRAIN_DATA = [ - ( - "Horses are too tall and they pretend to care about your feelings", - {"entities": [(0, 6, LABEL)]}, - ), - ("Do they bite?", {"entities": []}), - ( - "horses are too tall and they pretend to care about your feelings", - {"entities": [(0, 6, LABEL)]}, - ), - ("horses pretend to care about your feelings", {"entities": [(0, 6, LABEL)]}), - ( - "they pretend to care about your feelings, those horses", - {"entities": [(48, 54, LABEL)]}, - ), - ("horses?", {"entities": [(0, 6, LABEL)]}), -] - - -@plac.annotations( - model=("Model name. 
Defaults to blank 'en' model.", "option", "m", str), - new_model_name=("New model name for model meta.", "option", "nm", str), - output_dir=("Optional output directory", "option", "o", Path), - n_iter=("Number of training iterations", "option", "n", int), -) -def main(model=None, new_model_name="animal", output_dir=None, n_iter=30): - """Set up the pipeline and entity recognizer, and train the new entity.""" - random.seed(0) - if model is not None: - nlp = spacy.load(model) # load existing spaCy model - print("Loaded model '%s'" % model) - else: - nlp = spacy.blank("en") # create blank Language class - print("Created blank 'en' model") - # Add entity recognizer to model if it's not in the pipeline - # nlp.create_pipe works for built-ins that are registered with spaCy - train_examples = [] - for text, annotation in TRAIN_DATA: - train_examples.append(TRAIN_DATA.from_dict(nlp(text), annotation)) - - if "ner" not in nlp.pipe_names: - ner = nlp.create_pipe("ner") - nlp.add_pipe(ner) - # otherwise, get it, so we can add labels to it - else: - ner = nlp.get_pipe("ner") - - ner.add_label(LABEL) # add new entity label to entity recognizer - # Adding extraneous labels shouldn't mess anything up - ner.add_label("VEGETABLE") - if model is None: - optimizer = nlp.begin_training() - else: - optimizer = nlp.resume_training() - move_names = list(ner.move_names) - with nlp.select_pipes(enable="ner") and warnings.catch_warnings(): - # show warnings for misaligned entity spans once - warnings.filterwarnings("once", category=UserWarning, module="spacy") - - sizes = compounding(1.0, 4.0, 1.001) - # batch up the examples using spaCy's minibatch - for itn in range(n_iter): - random.shuffle(train_examples) - batches = minibatch(train_examples, size=sizes) - losses = {} - for batch in batches: - nlp.update(batch, sgd=optimizer, drop=0.35, losses=losses) - print("Losses", losses) - - # test the trained model - test_text = "Do you like horses?" - doc = nlp(test_text) - print("Entities in '%s'" % test_text) - for ent in doc.ents: - print(ent.label_, ent.text) - - # save model to output directory - if output_dir is not None: - output_dir = Path(output_dir) - if not output_dir.exists(): - output_dir.mkdir() - nlp.meta["name"] = new_model_name # rename model - nlp.to_disk(output_dir) - print("Saved model to", output_dir) - - # test the saved model - print("Loading from", output_dir) - nlp2 = spacy.load(output_dir) - # Check the classes have loaded back consistently - assert nlp2.get_pipe("ner").move_names == move_names - doc2 = nlp2(test_text) - for ent in doc2.ents: - print(ent.label_, ent.text) - - -if __name__ == "__main__": - plac.call(main) diff --git a/examples/training/train_parser.py b/examples/training/train_parser.py deleted file mode 100644 index d46a8f4b9..000000000 --- a/examples/training/train_parser.py +++ /dev/null @@ -1,110 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 -"""Example of training spaCy dependency parser, starting off with an existing -model or a blank model. 
For more details, see the documentation: -* Training: https://spacy.io/usage/training -* Dependency Parse: https://spacy.io/usage/linguistic-features#dependency-parse - -Compatible with: spaCy v2.0.0+ -Last tested with: v2.1.0 -""" -from __future__ import unicode_literals, print_function - -import plac -import random -from pathlib import Path -import spacy -from spacy.gold import Example -from spacy.util import minibatch, compounding - - -# training data -TRAIN_DATA = [ - ( - "They trade mortgage-backed securities.", - { - "heads": [1, 1, 4, 4, 5, 1, 1], - "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"], - }, - ), - ( - "I like London and Berlin.", - { - "heads": [1, 1, 1, 2, 2, 1], - "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"], - }, - ), -] - - -@plac.annotations( - model=("Model name. Defaults to blank 'en' model.", "option", "m", str), - output_dir=("Optional output directory", "option", "o", Path), - n_iter=("Number of training iterations", "option", "n", int), -) -def main(model=None, output_dir=None, n_iter=15): - """Load the model, set up the pipeline and train the parser.""" - if model is not None: - nlp = spacy.load(model) # load existing spaCy model - print("Loaded model '%s'" % model) - else: - nlp = spacy.blank("en") # create blank Language class - print("Created blank 'en' model") - - # add the parser to the pipeline if it doesn't exist - # nlp.create_pipe works for built-ins that are registered with spaCy - if "parser" not in nlp.pipe_names: - parser = nlp.create_pipe("parser") - nlp.add_pipe(parser, first=True) - # otherwise, get it, so we can add labels to it - else: - parser = nlp.get_pipe("parser") - - # add labels to the parser and create the Example objects - train_examples = [] - for text, annotations in TRAIN_DATA: - train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) - for dep in annotations.get("deps", []): - parser.add_label(dep) - - with nlp.select_pipes(enable="parser"): # only train parser - optimizer = nlp.begin_training() - for itn in range(n_iter): - random.shuffle(train_examples) - losses = {} - # batch up the examples using spaCy's minibatch - batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001)) - for batch in batches: - nlp.update(batch, sgd=optimizer, losses=losses) - print("Losses", losses) - - # test the trained model - test_text = "I like securities." - doc = nlp(test_text) - print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc]) - - # save model to output directory - if output_dir is not None: - output_dir = Path(output_dir) - if not output_dir.exists(): - output_dir.mkdir() - nlp.to_disk(output_dir) - print("Saved model to", output_dir) - - # test the saved model - print("Loading from", output_dir) - nlp2 = spacy.load(output_dir) - doc = nlp2(test_text) - print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc]) - - -if __name__ == "__main__": - plac.call(main) - - # expected result: - # [ - # ('I', 'nsubj', 'like'), - # ('like', 'ROOT', 'like'), - # ('securities', 'dobj', 'like'), - # ('.', 'punct', 'like') - # ] diff --git a/examples/training/train_tagger.py b/examples/training/train_tagger.py deleted file mode 100644 index 4eeb77fb9..000000000 --- a/examples/training/train_tagger.py +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 -""" -A simple example for training a part-of-speech tagger with a custom tag map. 
-To allow us to update the tag map with our custom one, this example starts off -with a blank Language class and modifies its defaults. For more details, see -the documentation: -* Training: https://spacy.io/usage/training -* POS Tagging: https://spacy.io/usage/linguistic-features#pos-tagging - -Compatible with: spaCy v2.0.0+ -Last tested with: v2.1.0 -""" -from __future__ import unicode_literals, print_function - -import plac -import random -from pathlib import Path -import spacy -from spacy.gold import Example -from spacy.util import minibatch, compounding - - -# You need to define a mapping from your data's part-of-speech tag names to the -# Universal Part-of-Speech tag set, as spaCy includes an enum of these tags. -# See here for the Universal Tag Set: -# http://universaldependencies.github.io/docs/u/pos/index.html -# You may also specify morphological features for your tags, from the universal -# scheme. -TAG_MAP = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}, "J": {"pos": "ADJ"}} - -# Usually you'll read this in, of course. Data formats vary. Ensure your -# strings are unicode and that the number of tags assigned matches spaCy's -# tokenization. If not, you can always add a 'words' key to the annotations -# that specifies the gold-standard tokenization, e.g.: -# ("Eatblueham", {'words': ['Eat', 'blue', 'ham'], 'tags': ['V', 'J', 'N']}) -TRAIN_DATA = [ - ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), - ("Eat blue ham", {"tags": ["V", "J", "N"]}), -] - - -@plac.annotations( - lang=("ISO Code of language to use", "option", "l", str), - output_dir=("Optional output directory", "option", "o", Path), - n_iter=("Number of training iterations", "option", "n", int), -) -def main(lang="en", output_dir=None, n_iter=25): - """Create a new model, set up the pipeline and train the tagger. In order to - train the tagger with a custom tag map, we're creating a new Language - instance with a custom vocab. - """ - nlp = spacy.blank(lang) - # add the tagger to the pipeline - # nlp.create_pipe works for built-ins that are registered with spaCy - tagger = nlp.create_pipe("tagger") - # Add the tags. This needs to be done before you start training. 
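    # Registering the labels up front lets the tagger size its output layer to
    # the full tag set before the weights are allocated in begin_training().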
- for tag, values in TAG_MAP.items(): - tagger.add_label(tag, values) - nlp.add_pipe(tagger) - - train_examples = [] - for text, annotations in TRAIN_DATA: - train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) - - optimizer = nlp.begin_training() - for i in range(n_iter): - random.shuffle(train_examples) - losses = {} - # batch up the examples using spaCy's minibatch - batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001)) - for batch in batches: - nlp.update(batch, sgd=optimizer, losses=losses) - print("Losses", losses) - - # test the trained model - test_text = "I like blue eggs" - doc = nlp(test_text) - print("Tags", [(t.text, t.tag_, t.pos_) for t in doc]) - - # save model to output directory - if output_dir is not None: - output_dir = Path(output_dir) - if not output_dir.exists(): - output_dir.mkdir() - nlp.to_disk(output_dir) - print("Saved model to", output_dir) - - # test the saved model - print("Loading from", output_dir) - nlp2 = spacy.load(output_dir) - doc = nlp2(test_text) - print("Tags", [(t.text, t.tag_, t.pos_) for t in doc]) - - -if __name__ == "__main__": - plac.call(main) - - # Expected output: - # [ - # ('I', 'N', 'NOUN'), - # ('like', 'V', 'VERB'), - # ('blue', 'J', 'ADJ'), - # ('eggs', 'N', 'NOUN') - # ] diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py deleted file mode 100644 index 901b382bf..000000000 --- a/examples/training/train_textcat.py +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 -"""Train a convolutional neural network text classifier on the -IMDB dataset, using the TextCategorizer component. The dataset will be loaded -automatically via the package `ml_datasets`. The model is added to -spacy.pipeline, and predictions are available via `doc.cats`. For more details, -see the documentation: -* Training: https://spacy.io/usage/training - -Compatible with: spaCy v3.0.0+ -""" -from __future__ import unicode_literals, print_function - -import plac -import random -from pathlib import Path -from ml_datasets import loaders - -import spacy -from spacy import util -from spacy.util import minibatch, compounding -from spacy.gold import Example -from thinc.api import Config - - -@plac.annotations( - config_path=("Path to config file", "positional", None, Path), - output_dir=("Optional output directory", "option", "o", Path), - n_texts=("Number of texts to train from", "option", "t", int), - n_iter=("Number of training iterations", "option", "n", int), - init_tok2vec=("Pretrained tok2vec weights", "option", "t2v", Path), - dataset=("Dataset to train on (default: imdb)", "option", "d", str), - threshold=("Min. 
number of instances for a given label (default 20)", "option", "m", int) -) -def main(config_path, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None, dataset="imdb", threshold=20): - if not config_path or not config_path.exists(): - raise ValueError(f"Config file not found at {config_path}") - - spacy.util.fix_random_seed() - if output_dir is not None: - output_dir = Path(output_dir) - if not output_dir.exists(): - output_dir.mkdir() - - print(f"Loading nlp model from {config_path}") - nlp_config = Config().from_disk(config_path) - nlp, _ = util.load_model_from_config(nlp_config, auto_fill=True) - - # ensure the nlp object was defined with a textcat component - if "textcat" not in nlp.pipe_names: - raise ValueError(f"The nlp definition in the config does not contain a textcat component") - - textcat = nlp.get_pipe("textcat") - - # load the dataset - print(f"Loading dataset {dataset} ...") - (train_texts, train_cats), (dev_texts, dev_cats) = load_data(dataset=dataset, threshold=threshold, limit=n_texts) - print( - "Using {} examples ({} training, {} evaluation)".format( - n_texts, len(train_texts), len(dev_texts) - ) - ) - train_examples = [] - for text, cats in zip(train_texts, train_cats): - doc = nlp.make_doc(text) - example = Example.from_dict(doc, {"cats": cats}) - for cat in cats: - textcat.add_label(cat) - train_examples.append(example) - - with nlp.select_pipes(enable="textcat"): # only train textcat - optimizer = nlp.begin_training() - if init_tok2vec is not None: - with init_tok2vec.open("rb") as file_: - textcat.model.get_ref("tok2vec").from_bytes(file_.read()) - print("Training the model...") - print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F")) - batch_sizes = compounding(4.0, 32.0, 1.001) - for i in range(n_iter): - losses = {} - # batch up the examples using spaCy's minibatch - random.shuffle(train_examples) - batches = minibatch(train_examples, size=batch_sizes) - for batch in batches: - nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses) - with textcat.model.use_params(optimizer.averages): - # evaluate on the dev data split off in load_data() - scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats) - print( - "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format( # print a simple table - losses["textcat"], - scores["textcat_p"], - scores["textcat_r"], - scores["textcat_f"], - ) - ) - - # test the trained model (only makes sense for sentiment analysis) - test_text = "This movie sucked" - doc = nlp(test_text) - print(test_text, doc.cats) - - if output_dir is not None: - with nlp.use_params(optimizer.averages): - nlp.to_disk(output_dir) - print("Saved model to", output_dir) - - # test the saved model - print("Loading from", output_dir) - nlp2 = spacy.load(output_dir) - doc2 = nlp2(test_text) - print(test_text, doc2.cats) - - -def load_data(dataset, threshold, limit=0, split=0.8): - """Load data from the provided dataset.""" - # Partition off part of the train data for evaluation - data_loader = loaders.get(dataset) - train_data, _ = data_loader(limit=int(limit/split)) - random.shuffle(train_data) - texts, labels = zip(*train_data) - - unique_labels = set() - for label_set in labels: - if isinstance(label_set, int) or isinstance(label_set, str): - unique_labels.add(label_set) - elif isinstance(label_set, list) or isinstance(label_set, set): - unique_labels.update(label_set) - unique_labels = sorted(unique_labels) - print(f"# of unique_labels: {len(unique_labels)}") - - count_values_train = dict() - for text, annot_list in train_data: - if 
isinstance(annot_list, int) or isinstance(annot_list, str): - count_values_train[annot_list] = count_values_train.get(annot_list, 0) + 1 - else: - for annot in annot_list: - count_values_train[annot] = count_values_train.get(annot, 0) + 1 - for value, count in sorted(count_values_train.items(), key=lambda item: item[1]): - if count < threshold: - unique_labels.remove(value) - - print(f"# of unique_labels after filtering with threshold {threshold}: {len(unique_labels)}") - - if unique_labels == {0, 1}: - cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels] - else: - cats = [] - for y in labels: - if isinstance(y, str) or isinstance(y, int): - cats.append({str(label): (label == y) for label in unique_labels}) - elif isinstance(y, set): - cats.append({str(label): (label in y) for label in unique_labels}) - else: - raise ValueError(f"Unrecognised type of labels: {type(y)}") - - split = int(len(train_data) * split) - return (texts[:split], cats[:split]), (texts[split:], cats[split:]) - - -def evaluate(tokenizer, textcat, texts, cats): - docs = (tokenizer(text) for text in texts) - tp = 0.0 # True positives - fp = 1e-8 # False positives - fn = 1e-8 # False negatives - tn = 0.0 # True negatives - for i, doc in enumerate(textcat.pipe(docs)): - gold = cats[i] - for label, score in doc.cats.items(): - if label not in gold: - continue - if label == "NEGATIVE": - continue - if score >= 0.5 and gold[label] >= 0.5: - tp += 1.0 - elif score >= 0.5 and gold[label] < 0.5: - fp += 1.0 - elif score < 0.5 and gold[label] < 0.5: - tn += 1 - elif score < 0.5 and gold[label] >= 0.5: - fn += 1 - precision = tp / (tp + fp) - recall = tp / (tp + fn) - if (precision + recall) == 0: - f_score = 0.0 - else: - f_score = 2 * (precision * recall) / (precision + recall) - return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score} - - -if __name__ == "__main__": - plac.call(main) diff --git a/examples/training/train_textcat_config.cfg b/examples/training/train_textcat_config.cfg deleted file mode 100644 index a1f4e91ce..000000000 --- a/examples/training/train_textcat_config.cfg +++ /dev/null @@ -1,14 +0,0 @@ -[nlp] -lang = "en" -pipeline = ["textcat"] - -[components] - -[components.textcat] -factory = "textcat" - -[components.textcat.model] -@architectures = "spacy.TextCatBOW.v1" -exclusive_classes = true -ngram_size = 1 -no_output_layer = false diff --git a/examples/vectors_fast_text.py b/examples/vectors_fast_text.py deleted file mode 100644 index 9b34811f7..000000000 --- a/examples/vectors_fast_text.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 -"""Load vectors for a language trained using fastText -https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md -Compatible with: spaCy v2.0.0+ -""" -from __future__ import unicode_literals -import plac -import numpy - -import spacy -from spacy.language import Language - - -@plac.annotations( - vectors_loc=("Path to .vec file", "positional", None, str), - lang=( - "Optional language ID. If not set, blank Language() will be used.", - "positional", - None, - str, - ), -) -def main(vectors_loc, lang=None): - if lang is None: - nlp = Language() - else: - # create empty language class – this is required if you're planning to - # save the model to disk and load it back later (models always need a - # "lang" setting). Use 'xx' for blank multi-language class. 
- nlp = spacy.blank(lang) - with open(vectors_loc, "rb") as file_: - header = file_.readline() - nr_row, nr_dim = header.split() - nlp.vocab.reset_vectors(width=int(nr_dim)) - for line in file_: - line = line.rstrip().decode("utf8") - pieces = line.rsplit(" ", int(nr_dim)) - word = pieces[0] - vector = numpy.asarray([float(v) for v in pieces[1:]], dtype="f") - nlp.vocab.set_vector(word, vector) # add the vectors to the vocab - # test the vectors and similarity - text = "class colspan" - doc = nlp(text) - print(text, doc[0].similarity(doc[1])) - - -if __name__ == "__main__": - plac.call(main) diff --git a/examples/vectors_tensorboard.py b/examples/vectors_tensorboard.py deleted file mode 100644 index 72eda1edc..000000000 --- a/examples/vectors_tensorboard.py +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env python -# coding: utf8 -"""Visualize spaCy word vectors in Tensorboard. - -Adapted from: https://gist.github.com/BrikerMan/7bd4e4bd0a00ac9076986148afc06507 -""" -from __future__ import unicode_literals - -from os import path - -import tqdm -import math -import numpy -import plac -import spacy -import tensorflow as tf -from tensorflow.contrib.tensorboard.plugins.projector import ( - visualize_embeddings, - ProjectorConfig, -) - - -@plac.annotations( - vectors_loc=("Path to spaCy model that contains vectors", "positional", None, str), - out_loc=( - "Path to output folder for tensorboard session data", - "positional", - None, - str, - ), - name=( - "Human readable name for tsv file and vectors tensor", - "positional", - None, - str, - ), -) -def main(vectors_loc, out_loc, name="spaCy_vectors"): - meta_file = "{}.tsv".format(name) - out_meta_file = path.join(out_loc, meta_file) - - print("Loading spaCy vectors model: {}".format(vectors_loc)) - model = spacy.load(vectors_loc) - print("Finding lexemes with vectors attached: {}".format(vectors_loc)) - strings_stream = tqdm.tqdm( - model.vocab.strings, total=len(model.vocab.strings), leave=False - ) - queries = [w for w in strings_stream if model.vocab.has_vector(w)] - vector_count = len(queries) - - print( - "Building Tensorboard Projector metadata for ({}) vectors: {}".format( - vector_count, out_meta_file - ) - ) - - # Store vector data in a tensorflow variable - tf_vectors_variable = numpy.zeros((vector_count, model.vocab.vectors.shape[1])) - - # Write a tab-separated file that contains information about the vectors for visualization - # - # Reference: https://www.tensorflow.org/programmers_guide/embedding#metadata - with open(out_meta_file, "wb") as file_metadata: - # Define columns in the first row - file_metadata.write("Text\tFrequency\n".encode("utf-8")) - # Write out a row for each vector that we add to the tensorflow variable we created - vec_index = 0 - for text in tqdm.tqdm(queries, total=len(queries), leave=False): - # https://github.com/tensorflow/tensorflow/issues/9094 - text = "" if text.lstrip() == "" else text - lex = model.vocab[text] - - # Store vector data and metadata - tf_vectors_variable[vec_index] = model.vocab.get_vector(text) - file_metadata.write( - "{}\t{}\n".format(text, math.exp(lex.prob) * vector_count).encode( - "utf-8" - ) - ) - vec_index += 1 - - print("Running Tensorflow Session...") - sess = tf.InteractiveSession() - tf.Variable(tf_vectors_variable, trainable=False, name=name) - tf.global_variables_initializer().run() - saver = tf.train.Saver() - writer = tf.summary.FileWriter(out_loc, sess.graph) - - # Link the embeddings into the config - config = ProjectorConfig() - embed = config.embeddings.add() - 
embed.tensor_name = name - embed.metadata_path = meta_file - - # Tell the projector about the configured embeddings and metadata file - visualize_embeddings(writer, config) - - # Save session and print run command to the output - print("Saving Tensorboard Session...") - saver.save(sess, path.join(out_loc, "{}.ckpt".format(name))) - print("Done. Run `tensorboard --logdir={0}` to view in Tensorboard".format(out_loc)) - - -if __name__ == "__main__": - plac.call(main) diff --git a/examples/training/ner_example_data/README.md b/extra/example_data/ner_example_data/README.md similarity index 100% rename from examples/training/ner_example_data/README.md rename to extra/example_data/ner_example_data/README.md diff --git a/examples/training/ner_example_data/ner-sent-per-line.iob b/extra/example_data/ner_example_data/ner-sent-per-line.iob similarity index 100% rename from examples/training/ner_example_data/ner-sent-per-line.iob rename to extra/example_data/ner_example_data/ner-sent-per-line.iob diff --git a/examples/training/ner_example_data/ner-sent-per-line.json b/extra/example_data/ner_example_data/ner-sent-per-line.json similarity index 100% rename from examples/training/ner_example_data/ner-sent-per-line.json rename to extra/example_data/ner_example_data/ner-sent-per-line.json diff --git a/examples/training/ner_example_data/ner-token-per-line-conll2003.iob b/extra/example_data/ner_example_data/ner-token-per-line-conll2003.iob similarity index 100% rename from examples/training/ner_example_data/ner-token-per-line-conll2003.iob rename to extra/example_data/ner_example_data/ner-token-per-line-conll2003.iob diff --git a/examples/training/ner_example_data/ner-token-per-line-conll2003.json b/extra/example_data/ner_example_data/ner-token-per-line-conll2003.json similarity index 100% rename from examples/training/ner_example_data/ner-token-per-line-conll2003.json rename to extra/example_data/ner_example_data/ner-token-per-line-conll2003.json diff --git a/examples/training/ner_example_data/ner-token-per-line-with-pos.iob b/extra/example_data/ner_example_data/ner-token-per-line-with-pos.iob similarity index 100% rename from examples/training/ner_example_data/ner-token-per-line-with-pos.iob rename to extra/example_data/ner_example_data/ner-token-per-line-with-pos.iob diff --git a/examples/training/ner_example_data/ner-token-per-line-with-pos.json b/extra/example_data/ner_example_data/ner-token-per-line-with-pos.json similarity index 100% rename from examples/training/ner_example_data/ner-token-per-line-with-pos.json rename to extra/example_data/ner_example_data/ner-token-per-line-with-pos.json diff --git a/examples/training/ner_example_data/ner-token-per-line.iob b/extra/example_data/ner_example_data/ner-token-per-line.iob similarity index 100% rename from examples/training/ner_example_data/ner-token-per-line.iob rename to extra/example_data/ner_example_data/ner-token-per-line.iob diff --git a/examples/training/ner_example_data/ner-token-per-line.json b/extra/example_data/ner_example_data/ner-token-per-line.json similarity index 100% rename from examples/training/ner_example_data/ner-token-per-line.json rename to extra/example_data/ner_example_data/ner-token-per-line.json diff --git a/examples/training/textcat_example_data/CC0.txt b/extra/example_data/textcat_example_data/CC0.txt similarity index 100% rename from examples/training/textcat_example_data/CC0.txt rename to extra/example_data/textcat_example_data/CC0.txt diff --git a/examples/training/textcat_example_data/CC_BY-SA-3.0.txt 
b/extra/example_data/textcat_example_data/CC_BY-SA-3.0.txt similarity index 100% rename from examples/training/textcat_example_data/CC_BY-SA-3.0.txt rename to extra/example_data/textcat_example_data/CC_BY-SA-3.0.txt diff --git a/examples/training/textcat_example_data/CC_BY-SA-4.0.txt b/extra/example_data/textcat_example_data/CC_BY-SA-4.0.txt similarity index 100% rename from examples/training/textcat_example_data/CC_BY-SA-4.0.txt rename to extra/example_data/textcat_example_data/CC_BY-SA-4.0.txt diff --git a/examples/training/textcat_example_data/README.md b/extra/example_data/textcat_example_data/README.md similarity index 100% rename from examples/training/textcat_example_data/README.md rename to extra/example_data/textcat_example_data/README.md diff --git a/examples/training/textcat_example_data/cooking.json b/extra/example_data/textcat_example_data/cooking.json similarity index 100% rename from examples/training/textcat_example_data/cooking.json rename to extra/example_data/textcat_example_data/cooking.json diff --git a/examples/training/textcat_example_data/cooking.jsonl b/extra/example_data/textcat_example_data/cooking.jsonl similarity index 100% rename from examples/training/textcat_example_data/cooking.jsonl rename to extra/example_data/textcat_example_data/cooking.jsonl diff --git a/examples/training/textcat_example_data/jigsaw-toxic-comment.json b/extra/example_data/textcat_example_data/jigsaw-toxic-comment.json similarity index 100% rename from examples/training/textcat_example_data/jigsaw-toxic-comment.json rename to extra/example_data/textcat_example_data/jigsaw-toxic-comment.json diff --git a/examples/training/textcat_example_data/jigsaw-toxic-comment.jsonl b/extra/example_data/textcat_example_data/jigsaw-toxic-comment.jsonl similarity index 100% rename from examples/training/textcat_example_data/jigsaw-toxic-comment.jsonl rename to extra/example_data/textcat_example_data/jigsaw-toxic-comment.jsonl diff --git a/examples/training/textcat_example_data/textcatjsonl_to_trainjson.py b/extra/example_data/textcat_example_data/textcatjsonl_to_trainjson.py similarity index 100% rename from examples/training/textcat_example_data/textcatjsonl_to_trainjson.py rename to extra/example_data/textcat_example_data/textcatjsonl_to_trainjson.py diff --git a/examples/training/training-data.json b/extra/example_data/training-data.json similarity index 100% rename from examples/training/training-data.json rename to extra/example_data/training-data.json diff --git a/examples/training/vocab-data.jsonl b/extra/example_data/vocab-data.jsonl similarity index 100% rename from examples/training/vocab-data.jsonl rename to extra/example_data/vocab-data.jsonl diff --git a/examples/experiments/onto-joint/defaults.cfg b/extra/experiments/onto-joint/defaults.cfg similarity index 100% rename from examples/experiments/onto-joint/defaults.cfg rename to extra/experiments/onto-joint/defaults.cfg diff --git a/examples/experiments/onto-joint/pretrain.cfg b/extra/experiments/onto-joint/pretrain.cfg similarity index 100% rename from examples/experiments/onto-joint/pretrain.cfg rename to extra/experiments/onto-joint/pretrain.cfg diff --git a/examples/experiments/onto-ner.cfg b/extra/experiments/onto-ner.cfg similarity index 100% rename from examples/experiments/onto-ner.cfg rename to extra/experiments/onto-ner.cfg diff --git a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/extra/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg similarity index 100% rename from 
examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg rename to extra/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/extra/experiments/ptb-joint-pos-dep/defaults.cfg similarity index 100% rename from examples/experiments/ptb-joint-pos-dep/defaults.cfg rename to extra/experiments/ptb-joint-pos-dep/defaults.cfg diff --git a/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg b/extra/experiments/tok2vec-ner/charembed_tok2vec.cfg similarity index 100% rename from examples/experiments/tok2vec-ner/charembed_tok2vec.cfg rename to extra/experiments/tok2vec-ner/charembed_tok2vec.cfg diff --git a/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg b/extra/experiments/tok2vec-ner/multihashembed_tok2vec.cfg similarity index 100% rename from examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg rename to extra/experiments/tok2vec-ner/multihashembed_tok2vec.cfg
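The configs moved to extra/experiments/ above share the [nlp] / [components] layout of the removed examples/training/train_textcat_config.cfg. A minimal sketch of how such a config was consumed, using only the calls the removed train_textcat.py itself relies on (Config.from_disk plus util.load_model_from_config, from the spaCy nightly API of the time): the config path below is a placeholder, and whether a particular file resolves cleanly depends on what it contains.

from pathlib import Path

from spacy import util
from thinc.api import Config

# Placeholder path: any config with an [nlp] block naming the pipeline and
# matching [components.*] blocks, such as the removed train_textcat_config.cfg
# or the files kept under extra/experiments/, is laid out this way.
config_path = Path("extra/experiments/onto-ner.cfg")

# Parse the config and build a pipeline object from it, auto-filling any
# settings the file leaves out; this mirrors the loading step in the removed
# train_textcat.py.
nlp_config = Config().from_disk(config_path)
nlp, _ = util.load_model_from_config(nlp_config, auto_fill=True)
print(nlp.pipe_names)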