From b4e0d2bf50fe6c654886eccb0395e47ccfbc3bef Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Feb 2020 20:59:10 +0100 Subject: [PATCH 01/46] Improve Makefile (#5067) * Improve pex making * Update gitignore --- .gitignore | 2 ++ Makefile | 42 +++++++++++++++++++++++++----------------- 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/.gitignore b/.gitignore index c4ad59fc7..828258603 100644 --- a/.gitignore +++ b/.gitignore @@ -40,6 +40,7 @@ __pycache__/ .~env/ .venv venv/ +env3.*/ .dev .denv .pypyenv @@ -56,6 +57,7 @@ lib64/ parts/ sdist/ var/ +wheelhouse/ *.egg-info/ pip-wheel-metadata/ Pipfile.lock diff --git a/Makefile b/Makefile index 5d15bccec..1be1c9794 100644 --- a/Makefile +++ b/Makefile @@ -1,28 +1,36 @@ SHELL := /bin/bash -sha = $(shell "git" "rev-parse" "--short" "HEAD") +WHEELHOUSE := ./wheelhouse +PYVER := 3.6 +VENV := ./env$(PYVER) + version = $(shell "bin/get-version.sh") -wheel = spacy-$(version)-cp36-cp36m-linux_x86_64.whl -dist/spacy.pex : dist/spacy-$(sha).pex - cp dist/spacy-$(sha).pex dist/spacy.pex - chmod a+rx dist/spacy.pex +dist/spacy-$(version).pex : wheelhouse/spacy-$(version)-*.whl + pex -f ./wheelhouse --no-index --disable-cache -m spacy -o dist/spacy-$(version).pex spacy==$(version) jsonschema + chmod a+rx dist/spacy-$(version).pex -dist/spacy-$(sha).pex : dist/$(wheel) - env3.6/bin/python -m pip install pex==1.5.3 - env3.6/bin/pex pytest dist/$(wheel) spacy_lookups_data -e spacy -o dist/spacy-$(sha).pex +dist/pytest.pex : wheelhouse/pytest-*.whl + $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m pytest -o dist/pytest.pex pytest pytest-timeout mock + chmod a+rx dist/pytest.pex -dist/$(wheel) : setup.py spacy/*.py* spacy/*/*.py* - python3.6 -m venv env3.6 - source env3.6/bin/activate - env3.6/bin/pip install wheel - env3.6/bin/pip install -r requirements.txt --no-cache-dir - env3.6/bin/python setup.py build_ext --inplace - env3.6/bin/python setup.py sdist - env3.6/bin/python setup.py bdist_wheel +wheelhouse/spacy-$(version)-%.whl : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py* + $(VENV)/bin/pip wheel . 
-w ./wheelhouse + $(VENV)/bin/pip wheel jsonschema spacy_lookups_data -w ./wheelhouse -.PHONY : clean +wheelhouse/pytest-%.whl : $(VENV)/bin/pex + $(VENV)/bin/pip wheel pytest pytest-timeout mock -w ./wheelhouse + +$(VENV) : + python$(PYVER) -m venv $(VENV) + $(VENV)/bin/python -m pip install pex wheel + +.PHONY : clean test + +test : dist/spacy-$(version).pex dist/pytest.pex + PEX_PATH=dist/spacy-$(version).pex ./dist/pytest.pex --pyargs spacy -x clean : setup.py source env3.6/bin/activate rm -rf dist/* + rm -rf ./wheelhouse python setup.py clean --all From 65d7bab10f540d3acd09da9c1cece5a166670a21 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 27 Feb 2020 18:43:00 +0100 Subject: [PATCH 02/46] Initialize all values in a2b/b2a in new align (#5063) --- spacy/gold.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 3884e1cba..07fd3bdd0 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -151,6 +151,8 @@ def align(tokens_a, tokens_b): cost = 0 a2b = numpy.empty(len(tokens_a), dtype="i") b2a = numpy.empty(len(tokens_b), dtype="i") + a2b.fill(-1) + b2a.fill(-1) a2b_multi = {} b2a_multi = {} i = 0 @@ -160,7 +162,6 @@ def align(tokens_a, tokens_b): while i < len(tokens_a) and j < len(tokens_b): a = tokens_a[i][offset_a:] b = tokens_b[j][offset_b:] - a2b[i] = b2a[j] = -1 if a == b: if offset_a == offset_b == 0: a2b[i] = j From c6b12ab02adcdfe760bc10e249924553cb826410 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 2 Mar 2020 11:49:28 +0100 Subject: [PATCH 03/46] Bugfix/get doc (#5049) * new (broken) unit test * fixing get_doc method --- spacy/errors.py | 4 ++ spacy/pipeline/pipes.pyx | 2 +- spacy/tests/doc/test_doc_api.py | 9 +-- spacy/tests/doc/test_token_api.py | 2 +- spacy/tests/parser/test_parse_navigate.py | 32 +++++----- spacy/tests/regression/test_issue2001-2500.py | 2 +- spacy/tests/regression/test_issue2501-3000.py | 2 +- spacy/tests/regression/test_issue4590.py | 2 +- spacy/tests/regression/test_issue5048.py | 35 +++++++++++ spacy/tests/test_displacy.py | 10 ++-- spacy/tests/util.py | 58 ++++++++++++++----- spacy/tokens/doc.pyx | 4 +- 12 files changed, 115 insertions(+), 47 deletions(-) create mode 100644 spacy/tests/regression/test_issue5048.py diff --git a/spacy/errors.py b/spacy/errors.py index 2f0a8a2ad..5957c5ecd 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -107,6 +107,9 @@ class Warnings(object): W027 = ("Found a large training file of {size} bytes. Note that it may " "be more efficient to split your training data into multiple " "smaller JSON files instead.") + W028 = ("Doc.from_array was called with a vector of type '{type}', " + "but is expecting one of type 'uint64' instead. This may result " + "in problems with the vocab further on in the pipeline.") @@ -541,6 +544,7 @@ class Errors(object): E188 = ("Could not match the gold entity links to entities in the doc - " "make sure the gold EL data refers to valid results of the " "named entity recognizer in the `nlp` pipeline.") + E189 = ("Each argument to `get_doc` should be of equal length.") @add_codes diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 3b190debe..a20c9b6df 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -367,7 +367,7 @@ class Tensorizer(Pipe): return sgd -@component("tagger", assigns=["token.tag", "token.pos"]) +@component("tagger", assigns=["token.tag", "token.pos", "token.lemma"]) class Tagger(Pipe): """Pipeline component for part-of-speech tagging. 
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 52f856d3e..19d908529 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -150,10 +150,9 @@ def test_doc_api_runtime_error(en_tokenizer): # Example that caused run-time error while parsing Reddit # fmt: off text = "67% of black households are single parent \n\n72% of all black babies born out of wedlock \n\n50% of all black kids don\u2019t finish high school" - deps = ["nsubj", "prep", "amod", "pobj", "ROOT", "amod", "attr", "", - "nummod", "prep", "det", "amod", "pobj", "acl", "prep", "prep", - "pobj", "", "nummod", "prep", "det", "amod", "pobj", "aux", "neg", - "ROOT", "amod", "dobj"] + deps = ["nummod", "nsubj", "prep", "amod", "pobj", "ROOT", "amod", "attr", "", "nummod", "appos", "prep", "det", + "amod", "pobj", "acl", "prep", "prep", "pobj", + "", "nummod", "nsubj", "prep", "det", "amod", "pobj", "aux", "neg", "ccomp", "amod", "dobj"] # fmt: on tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps) @@ -277,7 +276,9 @@ def test_doc_is_nered(en_vocab): def test_doc_from_array_sent_starts(en_vocab): words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."] heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6] + # fmt: off deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep", "dep"] + # fmt: on doc = Doc(en_vocab, words=words) for i, (dep, head) in enumerate(zip(deps, heads)): doc[i].dep_ = dep diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index bff2a95c6..b7522bb98 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -214,7 +214,7 @@ def test_token_api_conjuncts_chain(en_vocab): def test_token_api_conjuncts_simple(en_vocab): words = "They came and went .".split() heads = [1, 0, -1, -2, -1] - deps = ["nsubj", "ROOT", "cc", "conj"] + deps = ["nsubj", "ROOT", "cc", "conj", "dep"] doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) assert [w.text for w in doc[1].conjuncts] == ["went"] assert [w.text for w in doc[3].conjuncts] == ["came"] diff --git a/spacy/tests/parser/test_parse_navigate.py b/spacy/tests/parser/test_parse_navigate.py index eb206458e..41524d45e 100644 --- a/spacy/tests/parser/test_parse_navigate.py +++ b/spacy/tests/parser/test_parse_navigate.py @@ -34,23 +34,23 @@ BIG BROTHER IS WATCHING YOU, the caption beneath it ran. 
@pytest.fixture def heads(): # fmt: off - return [1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, -10, 2, 1, -3, -1, -15, - -1, 1, 4, -1, 1, -3, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1, - -4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, 3, 1, 1, -14, - 1, -2, 1, -2, -1, 1, -2, -6, -1, -1, -2, -1, -1, -42, -1, 2, 1, - 0, -1, 1, -2, -1, 2, 1, -4, -8, 0, 1, -2, -1, -1, 3, -1, 1, -6, - 9, 1, 7, -1, 1, -2, 3, 2, 1, -10, -1, 1, -2, -22, -1, 1, 0, -1, - 2, 1, -4, -1, -2, -1, 1, -2, -6, -7, 1, -9, -1, 2, -1, -3, -1, - 3, 2, 1, -4, -19, -24, 3, 2, 1, -4, -1, 1, 2, -1, -5, -34, 1, 0, - -1, 1, -2, -4, 1, 0, 1, -2, -1, 1, -2, -6, 1, 9, -1, 1, -3, -1, - -1, 3, 2, 1, 0, -1, -2, 7, -1, 5, 1, 3, -1, 1, -10, -1, -2, 1, - -2, -15, 1, 0, -1, -1, 2, 1, -3, -1, -1, -2, -1, 1, -2, -12, 1, - 1, 0, 1, -2, -1, -2, -3, 9, -1, 2, -1, -4, 2, 1, -3, -4, -15, 2, - 1, -3, -1, 2, 1, -3, -8, -9, -1, -2, -1, -4, 1, -2, -3, 1, -2, - -19, 17, 1, -2, 14, 13, 3, 2, 1, -4, 8, -1, 1, 5, -1, 2, 1, -3, + return [1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, 2, 1, -12, -1, -2, + -1, 1, 4, 3, 1, 1, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1, + -4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, -11, 1, 1, -14, + 1, -2, 1, -2, -1, 1, -2, -6, -1, -1, -2, -1, -1, -42, -1, 1, 1, + 0, -1, 1, -2, -1, 2, 1, -4, -8, 18, 1, -2, -1, -1, 3, -1, 1, 10, + 9, 1, 7, -1, 1, -2, 3, 2, 1, 0, -1, 1, -2, -4, -1, 1, 0, -1, + 2, 1, -4, -1, 2, 1, 1, 1, -6, -11, 1, 20, -1, 2, -1, -3, -1, + 3, 2, 1, -4, -10, -11, 3, 2, 1, -4, -1, 1, -3, -1, 0, -1, 1, 0, + -1, 1, -2, -4, 1, 0, 1, -2, -1, 1, -2, -6, 1, 9, -1, 1, 6, -1, + -1, 3, 2, 1, 0, -1, -2, 7, -1, 2, 1, 3, -1, 1, -10, -1, -2, 1, + -2, -5, 1, 0, -1, -1, 1, -2, -5, -1, -1, -2, -1, 1, -2, -12, 1, + 1, 0, 1, -2, -1, -4, -5, 18, -1, 2, -1, -4, 2, 1, -3, -4, -5, 2, + 1, -3, -1, 2, 1, -3, -17, -24, -1, -2, -1, -4, 1, -2, -3, 1, -2, + -10, 17, 1, -2, 14, 13, 3, 2, 1, -4, 8, -1, 1, 5, -1, 2, 1, -3, 0, -1, 1, -2, -4, 1, 0, -1, -1, 2, -1, -3, 1, -2, 1, -2, 3, 1, - 1, -4, -1, -2, 2, 1, -5, -19, -1, 1, 1, 0, 1, 6, -1, 1, -3, -1, - -1, -8, -9, -1] + 1, -4, -1, -2, 2, 1, -3, -19, -1, 1, 1, 0, 0, 6, 5, 1, 3, -1, + -1, 0, -1, -1] # fmt: on diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index e95c1a9b9..01f0f905c 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -48,7 +48,7 @@ def test_issue2203(en_vocab): tag_ids = [en_vocab.strings.add(tag) for tag in tags] lemma_ids = [en_vocab.strings.add(lemma) for lemma in lemmas] doc = Doc(en_vocab, words=words) - # Work around lemma corrpution problem and set lemmas after tags + # Work around lemma corruption problem and set lemmas after tags doc.from_array("TAG", numpy.array(tag_ids, dtype="uint64")) doc.from_array("LEMMA", numpy.array(lemma_ids, dtype="uint64")) assert [t.tag_ for t in doc] == tags diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index 73ff7376a..1f5e44499 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -124,7 +124,7 @@ def test_issue2772(en_vocab): words = "When we write or communicate virtually , we can hide our true feelings .".split() # A tree with a non-projective (i.e. crossing) arc # The arcs (0, 4) and (2, 9) cross. 
- heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, -1, -2, -1] + heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, 2, 1, -3, -4] deps = ["dep"] * len(heads) doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) assert doc[1].is_sent_start is None diff --git a/spacy/tests/regression/test_issue4590.py b/spacy/tests/regression/test_issue4590.py index 8ec9a0bd1..3d01cd487 100644 --- a/spacy/tests/regression/test_issue4590.py +++ b/spacy/tests/regression/test_issue4590.py @@ -27,7 +27,7 @@ def test_issue4590(en_vocab): text = "The quick brown fox jumped over the lazy fox" heads = [3, 2, 1, 1, 0, -1, 2, 1, -3] - deps = ["det", "amod", "amod", "nsubj", "prep", "pobj", "det", "amod"] + deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"] doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps) diff --git a/spacy/tests/regression/test_issue5048.py b/spacy/tests/regression/test_issue5048.py new file mode 100644 index 000000000..228322493 --- /dev/null +++ b/spacy/tests/regression/test_issue5048.py @@ -0,0 +1,35 @@ +# coding: utf8 +from __future__ import unicode_literals + +import numpy +from spacy.tokens import Doc +from spacy.attrs import DEP, POS, TAG + +from ..util import get_doc + + +def test_issue5048(en_vocab): + words = ["This", "is", "a", "sentence"] + pos_s = ["DET", "VERB", "DET", "NOUN"] + spaces = [" ", " ", " ", ""] + deps_s = ["dep", "adj", "nn", "atm"] + tags_s = ["DT", "VBZ", "DT", "NN"] + + strings = en_vocab.strings + + for w in words: + strings.add(w) + deps = [strings.add(d) for d in deps_s] + pos = [strings.add(p) for p in pos_s] + tags = [strings.add(t) for t in tags_s] + + attrs = [POS, DEP, TAG] + array = numpy.array(list(zip(pos, deps, tags)), dtype="uint64") + + doc = Doc(en_vocab, words=words, spaces=spaces) + doc.from_array(attrs, array) + v1 = [(token.text, token.pos_, token.tag_) for token in doc] + + doc2 = get_doc(en_vocab, words=words, pos=pos_s, deps=deps_s, tags=tags_s) + v2 = [(token.text, token.pos_, token.tag_) for token in doc2] + assert v1 == v2 diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index d04c0506f..539714e0c 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -31,10 +31,10 @@ def test_displacy_parse_deps(en_vocab): deps = displacy.parse_deps(doc) assert isinstance(deps, dict) assert deps["words"] == [ - {"lemma": None, "text": "This", "tag": "DET"}, - {"lemma": None, "text": "is", "tag": "AUX"}, - {"lemma": None, "text": "a", "tag": "DET"}, - {"lemma": None, "text": "sentence", "tag": "NOUN"}, + {"lemma": None, "text": words[0], "tag": pos[0]}, + {"lemma": None, "text": words[1], "tag": pos[1]}, + {"lemma": None, "text": words[2], "tag": pos[2]}, + {"lemma": None, "text": words[3], "tag": pos[3]}, ] assert deps["arcs"] == [ {"start": 0, "end": 1, "label": "nsubj", "dir": "left"}, @@ -75,7 +75,7 @@ def test_displacy_rtl(): deps = ["foo", "bar", "foo", "baz"] heads = [1, 0, 1, -2] nlp = Persian() - doc = get_doc(nlp.vocab, words=words, pos=pos, tags=pos, heads=heads, deps=deps) + doc = get_doc(nlp.vocab, words=words, tags=pos, heads=heads, deps=deps) doc.ents = [Span(doc, 1, 3, label="TEST")] html = displacy.render(doc, page=True, style="dep") assert "direction: rtl" in html diff --git a/spacy/tests/util.py b/spacy/tests/util.py index 9ee5b89f8..52768dd41 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -7,8 +7,10 @@ import shutil import contextlib import srsly from pathlib import Path + +from spacy import Errors from spacy.tokens import Doc, Span -from 
spacy.attrs import POS, HEAD, DEP +from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA from spacy.compat import path2str @@ -26,30 +28,54 @@ def make_tempdir(): shutil.rmtree(path2str(d)) -def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None): +def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None): """Create Doc object from given vocab, words and annotations.""" - pos = pos or [""] * len(words) - tags = tags or [""] * len(words) - heads = heads or [0] * len(words) - deps = deps or [""] * len(words) - for value in deps + tags + pos: + if deps and not heads: + heads = [0] * len(deps) + headings = [] + values = [] + annotations = [pos, heads, deps, lemmas, tags] + possible_headings = [POS, HEAD, DEP, LEMMA, TAG] + for a, annot in enumerate(annotations): + if annot is not None: + if len(annot) != len(words): + raise ValueError(Errors.E189) + headings.append(possible_headings[a]) + if annot is not heads: + values.extend(annot) + for value in values: vocab.strings.add(value) doc = Doc(vocab, words=words) - attrs = doc.to_array([POS, HEAD, DEP]) - for i, (p, head, dep) in enumerate(zip(pos, heads, deps)): - attrs[i, 0] = doc.vocab.strings[p] - attrs[i, 1] = head - attrs[i, 2] = doc.vocab.strings[dep] - doc.from_array([POS, HEAD, DEP], attrs) + + # if there are any other annotations, set them + if headings: + attrs = doc.to_array(headings) + + j = 0 + for annot in annotations: + if annot: + if annot is heads: + for i in range(len(words)): + if attrs.ndim == 1: + attrs[i] = heads[i] + else: + attrs[i,j] = heads[i] + else: + for i in range(len(words)): + if attrs.ndim == 1: + attrs[i] = doc.vocab.strings[annot[i]] + else: + attrs[i, j] = doc.vocab.strings[annot[i]] + j += 1 + doc.from_array(headings, attrs) + + # finally, set the entities if ents: doc.ents = [ Span(doc, start, end, label=doc.vocab.strings[label]) for start, end, label in ents ] - if tags: - for token in doc: - token.tag_ = tags[token.i] return doc diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 63495ec86..11f1ddf5f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -785,6 +785,8 @@ cdef class Doc: # Allow strings, e.g. 'lemma' or 'LEMMA' attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_) for id_ in attrs] + if array.dtype != numpy.uint64: + user_warning(Warnings.W028.format(type=array.dtype)) if SENT_START in attrs and HEAD in attrs: raise ValueError(Errors.E032) @@ -872,7 +874,7 @@ cdef class Doc: DOCS: https://spacy.io/api/doc#to_bytes """ - array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID] # TODO: ENT_KB_ID ? + array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, NORM] # TODO: ENT_KB_ID ? 
if self.is_tagged: array_head.extend([TAG, POS]) # If doc parsed add head and dep attribute From 2281c4708cc3dfa68ffcdff5554c18d8fae0c9de Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Mon, 2 Mar 2020 11:55:02 +0100 Subject: [PATCH 04/46] Restore empty tokenizer properties (#5026) * Restore empty tokenizer properties * Check for types in tokenizer.from_bytes() * Add test for setting empty tokenizer rules --- spacy/tests/serialize/test_serialize_tokenizer.py | 11 +++++++++-- spacy/tokenizer.pyx | 14 +++++++------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/spacy/tests/serialize/test_serialize_tokenizer.py b/spacy/tests/serialize/test_serialize_tokenizer.py index 9a273980c..0e0816a55 100644 --- a/spacy/tests/serialize/test_serialize_tokenizer.py +++ b/spacy/tests/serialize/test_serialize_tokenizer.py @@ -15,12 +15,19 @@ def load_tokenizer(b): def test_serialize_custom_tokenizer(en_vocab, en_tokenizer): - """Test that custom tokenizer with not all functions defined can be - serialized and deserialized correctly (see #2494).""" + """Test that custom tokenizer with not all functions defined or empty + properties can be serialized and deserialized correctly (see #2494, + #4991).""" tokenizer = Tokenizer(en_vocab, suffix_search=en_tokenizer.suffix_search) tokenizer_bytes = tokenizer.to_bytes() Tokenizer(en_vocab).from_bytes(tokenizer_bytes) + tokenizer = Tokenizer(en_vocab, rules={"ABC.": [{"ORTH": "ABC"}, {"ORTH": "."}]}) + tokenizer.rules = {} + tokenizer_bytes = tokenizer.to_bytes() + tokenizer_reloaded = Tokenizer(en_vocab).from_bytes(tokenizer_bytes) + assert tokenizer_reloaded.rules == {} + @pytest.mark.skip(reason="Currently unreliable across platforms") @pytest.mark.parametrize("text", ["I💜you", "they’re", "“hello”"]) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 230f41921..12c7b73af 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -14,7 +14,7 @@ import re from .tokens.doc cimport Doc from .strings cimport hash_string -from .compat import unescape_unicode +from .compat import unescape_unicode, basestring_ from .attrs import intify_attrs from .symbols import ORTH @@ -568,22 +568,22 @@ cdef class Tokenizer: for key in ["prefix_search", "suffix_search", "infix_finditer"]: if key in data: data[key] = unescape_unicode(data[key]) - if data.get("prefix_search"): + if "prefix_search" in data and isinstance(data["prefix_search"], basestring_): self.prefix_search = re.compile(data["prefix_search"]).search - if data.get("suffix_search"): + if "suffix_search" in data and isinstance(data["suffix_search"], basestring_): self.suffix_search = re.compile(data["suffix_search"]).search - if data.get("infix_finditer"): + if "infix_finditer" in data and isinstance(data["infix_finditer"], basestring_): self.infix_finditer = re.compile(data["infix_finditer"]).finditer - if data.get("token_match"): + if "token_match" in data and isinstance(data["token_match"], basestring_): self.token_match = re.compile(data["token_match"]).match - if data.get("rules"): + if "rules" in data and isinstance(data["rules"], dict): # make sure to hard reset the cache to remove data from the default exceptions self._rules = {} self._reset_cache([key for key in self._cache]) self._reset_specials() self._cache = PreshMap() self._specials = PreshMap() - self._load_special_tokenization(data.get("rules", {})) + self._load_special_tokenization(data["rules"]) return self From 697bec764de41e39582caadc14608607c2af8d09 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 3 Mar 2020 12:22:39 +0100
Subject: [PATCH 05/46] Normalize IS_SENT_START to SENT_START for Matcher (#5080) --- spacy/matcher/_schemas.py | 4 ++++ spacy/matcher/matcher.pyx | 2 ++ spacy/tests/matcher/test_pattern_validation.py | 2 ++ 3 files changed, 8 insertions(+) diff --git a/spacy/matcher/_schemas.py b/spacy/matcher/_schemas.py index 1b10f0dd5..4ef7ae49a 100644 --- a/spacy/matcher/_schemas.py +++ b/spacy/matcher/_schemas.py @@ -170,6 +170,10 @@ TOKEN_PATTERN_SCHEMA = { "title": "Token is the first in a sentence", "$ref": "#/definitions/boolean_value", }, + "SENT_START": { + "title": "Token is the first in a sentence", + "$ref": "#/definitions/boolean_value", + }, "LIKE_NUM": { "title": "Token resembles a number", "$ref": "#/definitions/boolean_value", diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 30ef3dd36..11461afb8 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -670,6 +670,8 @@ def _get_attr_values(spec, string_store): continue if attr == "TEXT": attr = "ORTH" + if attr == "IS_SENT_START": + attr = "SENT_START" if attr not in TOKEN_PATTERN_SCHEMA["items"]["properties"]: raise ValueError(Errors.E152.format(attr=attr)) attr = IDS.get(attr) diff --git a/spacy/tests/matcher/test_pattern_validation.py b/spacy/tests/matcher/test_pattern_validation.py index 2db2f9eb3..c536698d0 100644 --- a/spacy/tests/matcher/test_pattern_validation.py +++ b/spacy/tests/matcher/test_pattern_validation.py @@ -34,6 +34,8 @@ TEST_PATTERNS = [ ([{"LOWER": {"REGEX": "^X", "NOT_IN": ["XXX", "XY"]}}], 0, 0), ([{"NORM": "a"}, {"POS": {"IN": ["NOUN"]}}], 0, 0), ([{"_": {"foo": {"NOT_IN": ["bar", "baz"]}, "a": 5, "b": {">": 10}}}], 0, 0), + ([{"IS_SENT_START": True}], 0, 0), + ([{"SENT_START": True}], 0, 0), ] XFAIL_TEST_PATTERNS = [([{"orth": "foo"}], 0, 0)] From d078b47c81acdce5ece828f2f7d6e193bb3840ce Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 3 Mar 2020 12:29:05 +0100 Subject: [PATCH 06/46] Break out of infinite loop as intended (#5077) --- spacy/tokens/doc.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 11f1ddf5f..5997be804 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1175,6 +1175,7 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1: heads_within_sents = _set_lr_kids_and_edges(tokens, length, loop_count) if loop_count > 10: user_warning(Warnings.W026) + break loop_count += 1 # Set sentence starts for i in range(length): From d307e9ca58c84dc24e6717fccafe7b55c604ee7c Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 3 Mar 2020 13:58:22 +0100 Subject: [PATCH 07/46] take care of global vectors in multiprocessing (#5081) * restore load_nlp.VECTORS in the child process * add unit test * fix test * remove unnecessary import * add utf8 encoding * import unicode_literals --- spacy/_ml.py | 3 +-- spacy/language.py | 9 ++++++-- spacy/tests/regression/test_issue4725.py | 26 ++++++++++++++++++++++++ spacy/tests/regression/test_issue4849.py | 1 - spacy/tests/regression/test_issue4903.py | 2 -- 5 files changed, 34 insertions(+), 7 deletions(-) create mode 100644 spacy/tests/regression/test_issue4725.py diff --git a/spacy/_ml.py b/spacy/_ml.py index 8695a88cc..fb7d39255 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -296,8 +296,7 @@ def link_vectors_to_models(vocab): key = (ops.device, vectors.name) if key in thinc.extra.load_nlp.VECTORS: if thinc.extra.load_nlp.VECTORS[key].shape != data.shape: - # This is a hack to avoid the problem in #3853. 
Maybe we should - # print a warning as well? + # This is a hack to avoid the problem in #3853. old_name = vectors.name new_name = vectors.name + "_%d" % data.shape[0] user_warning(Warnings.W019.format(old=old_name, new=new_name)) diff --git a/spacy/language.py b/spacy/language.py index 16aa4967e..28fddfebb 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -3,6 +3,9 @@ from __future__ import absolute_import, unicode_literals import random import itertools + +from thinc.extra import load_nlp + from spacy.util import minibatch import weakref import functools @@ -856,7 +859,7 @@ class Language(object): procs = [ mp.Process( target=_apply_pipes, - args=(self.make_doc, pipes, rch, sch, Underscore.get_state()), + args=(self.make_doc, pipes, rch, sch, Underscore.get_state(), load_nlp.VECTORS), ) for rch, sch in zip(texts_q, bytedocs_send_ch) ] @@ -1112,7 +1115,7 @@ def _pipe(docs, proc, kwargs): yield doc -def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state): +def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state, vectors): """Worker for Language.pipe receiver (multiprocessing.Connection): Pipe to receive text. Usually @@ -1120,8 +1123,10 @@ def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state): sender (multiprocessing.Connection): Pipe to send doc. Usually created by `multiprocessing.Pipe()` underscore_state (tuple): The data in the Underscore class of the parent + vectors (dict): The global vectors data, copied from the parent """ Underscore.load_state(underscore_state) + load_nlp.VECTORS = vectors while True: texts = receiver.get() docs = (make_doc(text) for text in texts) diff --git a/spacy/tests/regression/test_issue4725.py b/spacy/tests/regression/test_issue4725.py new file mode 100644 index 000000000..f80f19852 --- /dev/null +++ b/spacy/tests/regression/test_issue4725.py @@ -0,0 +1,26 @@ +# coding: utf8 +from __future__ import unicode_literals + +import numpy + +from spacy.lang.en import English +from spacy.vocab import Vocab + + +def test_issue4725(): + # ensures that this runs correctly and doesn't hang or crash because of the global vectors + vocab = Vocab(vectors_name="test_vocab_add_vector") + data = numpy.ndarray((5, 3), dtype="f") + data[0] = 1.0 + data[1] = 2.0 + vocab.set_vector("cat", data[0]) + vocab.set_vector("dog", data[1]) + + nlp = English(vocab=vocab) + ner = nlp.create_pipe("ner") + nlp.add_pipe(ner) + nlp.begin_training() + docs = ["Kurt is in London."] * 10 + for _ in nlp.pipe(docs, batch_size=2, n_process=2): + pass + diff --git a/spacy/tests/regression/test_issue4849.py b/spacy/tests/regression/test_issue4849.py index 85d03fe9a..834219773 100644 --- a/spacy/tests/regression/test_issue4849.py +++ b/spacy/tests/regression/test_issue4849.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals from spacy.lang.en import English from spacy.pipeline import EntityRuler -from spacy.tokens.underscore import Underscore def test_issue4849(): diff --git a/spacy/tests/regression/test_issue4903.py b/spacy/tests/regression/test_issue4903.py index 9a3c10d61..d467b1cd6 100644 --- a/spacy/tests/regression/test_issue4903.py +++ b/spacy/tests/regression/test_issue4903.py @@ -1,10 +1,8 @@ # coding: utf8 from __future__ import unicode_literals -import spacy from spacy.lang.en import English from spacy.tokens import Span, Doc -from spacy.tokens.underscore import Underscore class CustomPipe: From a0998868ffe6d0d8d1a610374f537a4f41eda83e Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 3 Mar 2020 13:58:56 +0100 Subject: 
[PATCH 08/46] prevent updating cfg if the Model was already defined (#5078) --- spacy/syntax/nn_parser.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 153ca67cd..d5c6bf2a8 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -606,7 +606,6 @@ cdef class Parser: if not hasattr(get_gold_tuples, '__call__'): gold_tuples = get_gold_tuples get_gold_tuples = lambda: gold_tuples - cfg.setdefault('min_action_freq', 30) actions = self.moves.get_actions(gold_parses=get_gold_tuples(), min_freq=cfg.get('min_action_freq', 30), learn_tokens=self.cfg.get("learn_tokens", False)) @@ -616,8 +615,9 @@ cdef class Parser: if label not in actions[action]: actions[action][label] = freq self.moves.initialize_actions(actions) - cfg.setdefault('token_vector_width', 96) if self.model is True: + cfg.setdefault('min_action_freq', 30) + cfg.setdefault('token_vector_width', 96) self.model, cfg = self.Model(self.moves.n_moves, **cfg) if sgd is None: sgd = self.create_optimizer() @@ -633,11 +633,11 @@ cdef class Parser: if pipeline is not None: self.init_multitask_objectives(get_gold_tuples, pipeline, sgd=sgd, **cfg) link_vectors_to_models(self.vocab) + self.cfg.update(cfg) else: if sgd is None: sgd = self.create_optimizer() self.model.begin_training([]) - self.cfg.update(cfg) return sgd def to_disk(self, path, exclude=tuple(), **kwargs): From 8c20dae6f7b1d5ac056402e0057269ce80dba0fa Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 3 Mar 2020 21:43:25 +0100 Subject: [PATCH 09/46] Fix model-final/model-best meta from train CLI (#5093) * Fix model-final/model-best meta * include speed and accuracy from final iteration * combine with speeds from base model if necessary * Include token_acc metric for all components --- spacy/cli/train.py | 40 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 968a009f6..59b0f2225 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -554,7 +554,30 @@ def train( with nlp.use_params(optimizer.averages): final_model_path = output_path / "model-final" nlp.to_disk(final_model_path) - final_meta = srsly.read_json(output_path / "model-final" / "meta.json") + meta_loc = output_path / "model-final" / "meta.json" + final_meta = srsly.read_json(meta_loc) + final_meta.setdefault("accuracy", {}) + final_meta["accuracy"].update(meta.get("accuracy", {})) + final_meta.setdefault("speed", {}) + final_meta["speed"].setdefault("cpu", None) + final_meta["speed"].setdefault("gpu", None) + # combine cpu and gpu speeds with the base model speeds + if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]: + speed = _get_total_speed([final_meta["speed"]["cpu"], meta["speed"]["cpu"]]) + final_meta["speed"]["cpu"] = speed + if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]: + speed = _get_total_speed([final_meta["speed"]["gpu"], meta["speed"]["gpu"]]) + final_meta["speed"]["gpu"] = speed + # if there were no speeds to update, overwrite with meta + if final_meta["speed"]["cpu"] is None and final_meta["speed"]["gpu"] is None: + final_meta["speed"].update(meta["speed"]) + # note: beam speeds are not combined with the base model + if has_beam_widths: + final_meta.setdefault("beam_accuracy", {}) + final_meta["beam_accuracy"].update(meta.get("beam_accuracy", {})) + final_meta.setdefault("beam_speed", {}) + final_meta["beam_speed"].update(meta.get("beam_speed", {})) + srsly.write_json(meta_loc, final_meta) 
msg.good("Saved model to output directory", final_model_path) with msg.loading("Creating best model..."): best_model_path = _collate_best_model(final_meta, output_path, best_pipes) @@ -649,11 +672,11 @@ def _get_metrics(component): if component == "parser": return ("las", "uas", "las_per_type", "token_acc") elif component == "tagger": - return ("tags_acc",) + return ("tags_acc", "token_acc") elif component == "ner": - return ("ents_f", "ents_p", "ents_r", "ents_per_type") + return ("ents_f", "ents_p", "ents_r", "ents_per_type", "token_acc") elif component == "textcat": - return ("textcat_score",) + return ("textcat_score", "token_acc") return ("token_acc",) @@ -709,3 +732,12 @@ def _get_progress( if beam_width is not None: result.insert(1, beam_width) return result + + +def _get_total_speed(speeds): + seconds_per_word = 0.0 + for words_per_second in speeds: + if words_per_second is None: + return None + seconds_per_word += 1.0 / words_per_second + return 1.0 / seconds_per_word From 9be90dbca3a75ebbaa85ec14dd02fe3ab87291be Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 3 Mar 2020 21:44:51 +0100 Subject: [PATCH 10/46] Improve token head verification (#5079) * Improve token head verification Improve the verification for valid token heads when heads are set: * in `Token.head`: heads come from the same document * in `Doc.from_array()`: head indices are within the bounds of the document * Improve error message --- spacy/errors.py | 7 +++++++ spacy/tests/doc/test_array.py | 27 +++++++++++++++++++++++++++ spacy/tests/doc/test_token_api.py | 5 +++++ spacy/tokens/doc.pyx | 10 +++++++++- spacy/tokens/token.pyx | 3 +++ 5 files changed, 51 insertions(+), 1 deletion(-) diff --git a/spacy/errors.py b/spacy/errors.py index 5957c5ecd..b43b8487f 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -545,6 +545,13 @@ class Errors(object): "make sure the gold EL data refers to valid results of the " "named entity recognizer in the `nlp` pipeline.") E189 = ("Each argument to `get_doc` should be of equal length.") + E190 = ("Token head out of range in `Doc.from_array()` for token index " + "'{index}' with value '{value}' (equivalent to relative head " + "index: '{rel_head_index}'). 
The head indices should be relative " + "to the current token index rather than absolute indices in the " + "array.") + E191 = ("Invalid head: the head token must be from the same doc as the " + "token itself.") @add_codes diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py index aa0d37eca..1c0c79f6e 100644 --- a/spacy/tests/doc/test_array.py +++ b/spacy/tests/doc/test_array.py @@ -77,3 +77,30 @@ def test_doc_array_idx(en_vocab): assert offsets[0] == 0 assert offsets[1] == 3 assert offsets[2] == 11 + + +def test_doc_from_array_heads_in_bounds(en_vocab): + """Test that Doc.from_array doesn't set heads that are out of bounds.""" + words = ["This", "is", "a", "sentence", "."] + doc = Doc(en_vocab, words=words) + for token in doc: + token.head = doc[0] + + # correct + arr = doc.to_array(["HEAD"]) + doc_from_array = Doc(en_vocab, words=words) + doc_from_array.from_array(["HEAD"], arr) + + # head before start + arr = doc.to_array(["HEAD"]) + arr[0] = -1 + doc_from_array = Doc(en_vocab, words=words) + with pytest.raises(ValueError): + doc_from_array.from_array(["HEAD"], arr) + + # head after end + arr = doc.to_array(["HEAD"]) + arr[0] = 5 + doc_from_array = Doc(en_vocab, words=words) + with pytest.raises(ValueError): + doc_from_array.from_array(["HEAD"], arr) diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index b7522bb98..8c749b26d 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -167,6 +167,11 @@ def test_doc_token_api_head_setter(en_tokenizer): assert doc[4].left_edge.i == 0 assert doc[2].left_edge.i == 0 + # head token must be from the same document + doc2 = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + with pytest.raises(ValueError): + doc[0].head = doc2[0] + def test_is_sent_start(en_tokenizer): doc = en_tokenizer("This is a sentence. This is another.") diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 5997be804..0c90929c3 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -790,7 +790,7 @@ cdef class Doc: if SENT_START in attrs and HEAD in attrs: raise ValueError(Errors.E032) - cdef int i, col + cdef int i, col, abs_head_index cdef attr_id_t attr_id cdef TokenC* tokens = self.c cdef int length = len(array) @@ -804,6 +804,14 @@ cdef class Doc: attr_ids[i] = attr_id if len(array.shape) == 1: array = array.reshape((array.size, 1)) + # Check that all heads are within the document bounds + if HEAD in attrs: + col = attrs.index(HEAD) + for i in range(length): + # cast index to signed int + abs_head_index = numpy.int32(array[i, col]) + i + if abs_head_index < 0 or abs_head_index >= length: + raise ValueError(Errors.E190.format(index=i, value=array[i, col], rel_head_index=numpy.int32(array[i, col]))) # Do TAG first. 
This lets subsequent loop override stuff like POS, LEMMA if TAG in attrs: col = attrs.index(TAG) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 8b15a4223..8019e3b4f 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -623,6 +623,9 @@ cdef class Token: # This function sets the head of self to new_head and updates the # counters for left/right dependents and left/right corner for the # new and the old head + # Check that token is from the same document + if self.doc != new_head.doc: + raise ValueError(Errors.E191) # Do nothing if old head is new head if self.i + self.c.head == new_head.i: return From 03376c9d9bea0dd850bd2612521843f6c8f580ba Mon Sep 17 00:00:00 2001 From: Muhammad Irfan Date: Wed, 4 Mar 2020 11:58:56 +0500 Subject: [PATCH 11/46] Basque language added and tested. --- spacy/lang/eu/__init__.py | 30 +++++++++ spacy/lang/eu/examples.py | 16 +++++ spacy/lang/eu/lex_attrs.py | 80 +++++++++++++++++++++++ spacy/lang/eu/punctuation.py | 7 ++ spacy/lang/eu/stop_words.py | 108 +++++++++++++++++++++++++++++++ spacy/lang/eu/tag_map.py | 71 ++++++++++++++++++++ spacy/tests/conftest.py | 5 ++ spacy/tests/lang/eu/test_text.py | 16 +++++ 8 files changed, 333 insertions(+) create mode 100644 spacy/lang/eu/__init__.py create mode 100644 spacy/lang/eu/examples.py create mode 100644 spacy/lang/eu/lex_attrs.py create mode 100644 spacy/lang/eu/punctuation.py create mode 100644 spacy/lang/eu/stop_words.py create mode 100644 spacy/lang/eu/tag_map.py create mode 100644 spacy/tests/lang/eu/test_text.py diff --git a/spacy/lang/eu/__init__.py b/spacy/lang/eu/__init__.py new file mode 100644 index 000000000..4f3338c1d --- /dev/null +++ b/spacy/lang/eu/__init__.py @@ -0,0 +1,30 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS +from .punctuation import TOKENIZER_SUFFIXES +from .tag_map import TAG_MAP + +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ...language import Language +from ...attrs import LANG + + +class BasqueDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters.update(LEX_ATTRS) + lex_attr_getters[LANG] = lambda text: "eu" + + tokenizer_exceptions = BASE_EXCEPTIONS + tag_map = TAG_MAP + stop_words = STOP_WORDS + suffixes = TOKENIZER_SUFFIXES + + +class Basque(Language): + lang = "eu" + Defaults = BasqueDefaults + + +__all__ = ["Basque"] diff --git a/spacy/lang/eu/examples.py b/spacy/lang/eu/examples.py new file mode 100644 index 000000000..ec9f0dd06 --- /dev/null +++ b/spacy/lang/eu/examples.py @@ -0,0 +1,16 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. 
+ +>>> from spacy.lang.eu.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "", + "" +] diff --git a/spacy/lang/eu/lex_attrs.py b/spacy/lang/eu/lex_attrs.py new file mode 100644 index 000000000..c11e913db --- /dev/null +++ b/spacy/lang/eu/lex_attrs.py @@ -0,0 +1,80 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...attrs import LIKE_NUM + +# Source http://mylanguages.org/basque_numbers.php + + +_num_words = """ +bat +bi +hiru +lau +bost +sei +zazpi +zortzi +bederatzi +hamar +hamaika +hamabi +hamahiru +hamalau +hamabost +hamasei +hamazazpi +Hemezortzi +hemeretzi +hogei +ehun +mila +milioi +""".split() + +# source https://www.google.com/intl/ur/inputtools/try/ + +_ordinal_words = """ +lehen +bigarren +hirugarren +laugarren +bosgarren +seigarren +zazpigarren +zortzigarren +bederatzigarren +hamargarren +hamaikagarren +hamabigarren +hamahirugarren +hamalaugarren +hamabosgarren +hamaseigarren +hamazazpigarren +hamazortzigarren +hemeretzigarren +hogeigarren +behin +""".split() + + + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + if text in _num_words: + return True + if text in _ordinal_words: + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/eu/punctuation.py b/spacy/lang/eu/punctuation.py new file mode 100644 index 000000000..b8b1a1c83 --- /dev/null +++ b/spacy/lang/eu/punctuation.py @@ -0,0 +1,7 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ..punctuation import TOKENIZER_SUFFIXES + + +_suffixes = TOKENIZER_SUFFIXES diff --git a/spacy/lang/eu/stop_words.py b/spacy/lang/eu/stop_words.py new file mode 100644 index 000000000..208238961 --- /dev/null +++ b/spacy/lang/eu/stop_words.py @@ -0,0 +1,108 @@ +# encoding: utf8 +from __future__ import unicode_literals + +# Source: https://github.com/stopwords-iso/stopwords-eu +# https://www.ranks.nl/stopwords/basque +# https://www.mustgo.com/worldlanguages/basque/ +STOP_WORDS = set( +""" +al +anitz +arabera +asko +baina +bat +batean +batek +bati +batzuei +batzuek +batzuetan +batzuk +bera +beraiek +berau +berauek +bere +berori +beroriek +beste +bezala +da +dago +dira +ditu +du +dute +edo +egin +ere +eta +eurak +ez +gainera +gu +gutxi +guzti +haiei +haiek +haietan +hainbeste +hala +han +handik +hango +hara +hari +hark +hartan +hau +hauei +hauek +hauetan +hemen +hemendik +hemengo +hi +hona +honek +honela +honetan +honi +hor +hori +horiei +horiek +horietan +horko +horra +horrek +horrela +horretan +horri +hortik +hura +izan +ni +noiz +nola +non +nondik +nongo +nor +nora +ze +zein +zen +zenbait +zenbat +zer +zergatik +ziren +zituen +zu +zuek +zuen +zuten +""".split() +) diff --git a/spacy/lang/eu/tag_map.py b/spacy/lang/eu/tag_map.py new file mode 100644 index 000000000..2499d7e3e --- /dev/null +++ b/spacy/lang/eu/tag_map.py @@ -0,0 +1,71 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB +from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON + +TAG_MAP = { + ".": {POS: PUNCT, "PunctType": "peri"}, + ",": {POS: PUNCT, "PunctType": "comm"}, + "-LRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"}, + "-RRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"}, + "``": {POS: PUNCT, "PunctType": "quot", "PunctSide": 
"ini"}, + '""': {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"}, + "''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"}, + ":": {POS: PUNCT}, + "$": {POS: SYM, "Other": {"SymType": "currency"}}, + "#": {POS: SYM, "Other": {"SymType": "numbersign"}}, + "AFX": {POS: ADJ, "Hyph": "yes"}, + "CC": {POS: CCONJ, "ConjType": "coor"}, + "CD": {POS: NUM, "NumType": "card"}, + "DT": {POS: DET}, + "EX": {POS: ADV, "AdvType": "ex"}, + "FW": {POS: X, "Foreign": "yes"}, + "HYPH": {POS: PUNCT, "PunctType": "dash"}, + "IN": {POS: ADP}, + "JJ": {POS: ADJ, "Degree": "pos"}, + "JJR": {POS: ADJ, "Degree": "comp"}, + "JJS": {POS: ADJ, "Degree": "sup"}, + "LS": {POS: PUNCT, "NumType": "ord"}, + "MD": {POS: VERB, "VerbType": "mod"}, + "NIL": {POS: ""}, + "NN": {POS: NOUN, "Number": "sing"}, + "NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"}, + "NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"}, + "NNS": {POS: NOUN, "Number": "plur"}, + "PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"}, + "POS": {POS: PART, "Poss": "yes"}, + "PRP": {POS: PRON, "PronType": "prs"}, + "PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"}, + "RB": {POS: ADV, "Degree": "pos"}, + "RBR": {POS: ADV, "Degree": "comp"}, + "RBS": {POS: ADV, "Degree": "sup"}, + "RP": {POS: PART}, + "SP": {POS: SPACE}, + "SYM": {POS: SYM}, + "TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"}, + "UH": {POS: INTJ}, + "VB": {POS: VERB, "VerbForm": "inf"}, + "VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"}, + "VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"}, + "VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"}, + "VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"}, + "VBZ": { + POS: VERB, + "VerbForm": "fin", + "Tense": "pres", + "Number": "sing", + "Person": 3, + }, + "WDT": {POS: ADJ, "PronType": "int|rel"}, + "WP": {POS: NOUN, "PronType": "int|rel"}, + "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"}, + "WRB": {POS: ADV, "PronType": "int|rel"}, + "ADD": {POS: X}, + "NFP": {POS: PUNCT}, + "GW": {POS: X}, + "XX": {POS: X}, + "BES": {POS: VERB}, + "HVS": {POS: VERB}, + "_SP": {POS: SPACE}, +} diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 816970e61..fc89c2658 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -83,6 +83,11 @@ def es_tokenizer(): return get_lang_class("es").Defaults.create_tokenizer() +@pytest.fixture(scope="session") +def eu_tokenizer(): + return get_lang_class("eu").Defaults.create_tokenizer() + + @pytest.fixture(scope="session") def fi_tokenizer(): return get_lang_class("fi").Defaults.create_tokenizer() diff --git a/spacy/tests/lang/eu/test_text.py b/spacy/tests/lang/eu/test_text.py new file mode 100644 index 000000000..e73917ffa --- /dev/null +++ b/spacy/tests/lang/eu/test_text.py @@ -0,0 +1,16 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +def test_eu_tokenizer_handles_long_text(eu_tokenizer): + text = """ta nere guitarra estrenatu ondoren""" + tokens = eu_tokenizer(text) + assert len(tokens) == 5 + + +@pytest.mark.parametrize("text,length", [("milesker ederra joan zen hitzaldia plazer hutsa", 7), ("astelehen guztia sofan pasau biot", 5)]) +def test_eu_tokenizer_handles_cnts(eu_tokenizer, text, length): + tokens = eu_tokenizer(text) + assert len(tokens) == length From 224a7f8e94721a7af10e366773ce2c012a5b8f62 Mon Sep 17 00:00:00 2001 From: Muhammad Irfan Date: Wed, 4 Mar 2020 15:49:06 +0500 Subject: [PATCH 12/46] examples --- spacy/lang/eu/examples.py | 6 ++---- 1 file 
changed, 2 insertions(+), 4 deletions(-) diff --git a/spacy/lang/eu/examples.py b/spacy/lang/eu/examples.py index ec9f0dd06..f2d325d78 100644 --- a/spacy/lang/eu/examples.py +++ b/spacy/lang/eu/examples.py @@ -1,7 +1,6 @@ # coding: utf8 from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. @@ -9,8 +8,7 @@ Example sentences to test spaCy and its language models. >>> docs = nlp.pipe(sentences) """ - sentences = [ - "", - "" + "bilbon ko castinga egin da eta nik jakin ez zuetako inork egin al du edota parte hartu duen ezagunik ba al du", + "gaur telebistan entzunda denok martetik gatoz hortaz martzianoak gara beno nire ustez batzuk beste batzuk baino martzianoagoak dira" ] From 4d655b1d45577ceeb0113616f6cc7590568e5a2b Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 4 Mar 2020 13:50:37 +0100 Subject: [PATCH 13/46] Require srsly >=1.0.2 --- requirements.txt | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index e908e25f8..ec30efc16 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ thinc==7.4.0 blis>=0.4.0,<0.5.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.4.0,<1.1.0 -srsly>=1.0.1,<1.1.0 +srsly>=1.0.2,<1.1.0 catalogue>=0.0.7,<1.1.0 # Third party dependencies numpy>=1.15.0 diff --git a/setup.cfg b/setup.cfg index 1429c77ac..e44e32bb2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,7 +47,7 @@ install_requires = thinc==7.4.0 blis>=0.4.0,<0.5.0 wasabi>=0.4.0,<1.1.0 - srsly>=1.0.1,<1.1.0 + srsly>=1.0.2,<1.1.0 catalogue>=0.0.7,<1.1.0 # Third-party dependencies tqdm>=4.38.0,<5.0.0 From 3440a72ecb188850bf4b08244c2041ac0d8109a7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 4 Mar 2020 19:28:16 +0100 Subject: [PATCH 14/46] Update Makefile (#5099) --- Makefile | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/Makefile b/Makefile index 1be1c9794..13c9026b7 100644 --- a/Makefile +++ b/Makefile @@ -1,36 +1,37 @@ SHELL := /bin/bash -WHEELHOUSE := ./wheelhouse PYVER := 3.6 VENV := ./env$(PYVER) -version = $(shell "bin/get-version.sh") +version := $(shell "bin/get-version.sh") -dist/spacy-$(version).pex : wheelhouse/spacy-$(version)-*.whl - pex -f ./wheelhouse --no-index --disable-cache -m spacy -o dist/spacy-$(version).pex spacy==$(version) jsonschema - chmod a+rx dist/spacy-$(version).pex +dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp + $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema + chmod a+rx $@ dist/pytest.pex : wheelhouse/pytest-*.whl - $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m pytest -o dist/pytest.pex pytest pytest-timeout mock - chmod a+rx dist/pytest.pex + $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m pytest -o $@ pytest pytest-timeout mock + chmod a+rx $@ -wheelhouse/spacy-$(version)-%.whl : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py* +wheelhouse/spacy-$(version).stamp : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py* $(VENV)/bin/pip wheel . -w ./wheelhouse $(VENV)/bin/pip wheel jsonschema spacy_lookups_data -w ./wheelhouse + touch $@ wheelhouse/pytest-%.whl : $(VENV)/bin/pex $(VENV)/bin/pip wheel pytest pytest-timeout mock -w ./wheelhouse -$(VENV) : +$(VENV)/bin/pex : python$(PYVER) -m venv $(VENV) $(VENV)/bin/python -m pip install pex wheel .PHONY : clean test test : dist/spacy-$(version).pex dist/pytest.pex - PEX_PATH=dist/spacy-$(version).pex ./dist/pytest.pex --pyargs spacy -x + ( . 
$(VENV)/bin/activate ; \ + PEX_PATH=dist/spacy-$(version).pex ./dist/pytest.pex --pyargs spacy -x ; ) clean : setup.py - source env3.6/bin/activate rm -rf dist/* rm -rf ./wheelhouse + rm -rf $(VENV) python setup.py clean --all From 80004930ed098ec5b6bf9ecd081b96b1e7e7080f Mon Sep 17 00:00:00 2001 From: David Pollack Date: Thu, 5 Mar 2020 15:48:41 +0100 Subject: [PATCH 15/46] fix typo in svg file --- .github/contributors/dhpollack.md | 106 +++++++++++++++++++++++++++ website/src/images/logos/allenai.svg | 2 +- 2 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 .github/contributors/dhpollack.md diff --git a/.github/contributors/dhpollack.md b/.github/contributors/dhpollack.md new file mode 100644 index 000000000..444d97d42 --- /dev/null +++ b/.github/contributors/dhpollack.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [X] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | David Pollack | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | Mar 5. 
2020 | +| GitHub username | dhpollack | +| Website (optional) | | diff --git a/website/src/images/logos/allenai.svg b/website/src/images/logos/allenai.svg index 2879bef60..c00569bf8 100644 --- a/website/src/images/logos/allenai.svg +++ b/website/src/images/logos/allenai.svg @@ -1,6 +1,6 @@ - + From 1a2b8fc264efdc384c5497b97ee4b1f55675a3ec Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Fri, 6 Mar 2020 14:45:28 +0100 Subject: [PATCH 16/46] set vector of merged entity (#5085) * merge_entities sets the vector in the vocab for the merged token * add unit test * import unicode_literals * move code to _merge function * only set vector if vocab has non-zero vectors --- spacy/tests/regression/test_issue5082.py | 46 ++++++++++++++++++++++++ spacy/tokens/_retokenize.pyx | 4 +++ 2 files changed, 50 insertions(+) create mode 100644 spacy/tests/regression/test_issue5082.py diff --git a/spacy/tests/regression/test_issue5082.py b/spacy/tests/regression/test_issue5082.py new file mode 100644 index 000000000..efa5d39f2 --- /dev/null +++ b/spacy/tests/regression/test_issue5082.py @@ -0,0 +1,46 @@ +# coding: utf8 +from __future__ import unicode_literals + +import numpy as np +from spacy.lang.en import English +from spacy.pipeline import EntityRuler + + +def test_issue5082(): + # Ensure the 'merge_entities' pipeline does something sensible for the vectors of the merged tokens + nlp = English() + vocab = nlp.vocab + array1 = np.asarray([0.1, 0.5, 0.8], dtype=np.float32) + array2 = np.asarray([-0.2, -0.6, -0.9], dtype=np.float32) + array3 = np.asarray([0.3, -0.1, 0.7], dtype=np.float32) + array4 = np.asarray([0.5, 0, 0.3], dtype=np.float32) + array34 = np.asarray([0.4, -0.05, 0.5], dtype=np.float32) + + vocab.set_vector("I", array1) + vocab.set_vector("like", array2) + vocab.set_vector("David", array3) + vocab.set_vector("Bowie", array4) + + text = "I like David Bowie" + ruler = EntityRuler(nlp) + patterns = [ + {"label": "PERSON", "pattern": [{"LOWER": "david"}, {"LOWER": "bowie"}]} + ] + ruler.add_patterns(patterns) + nlp.add_pipe(ruler) + + parsed_vectors_1 = [t.vector for t in nlp(text)] + assert len(parsed_vectors_1) == 4 + np.testing.assert_array_equal(parsed_vectors_1[0], array1) + np.testing.assert_array_equal(parsed_vectors_1[1], array2) + np.testing.assert_array_equal(parsed_vectors_1[2], array3) + np.testing.assert_array_equal(parsed_vectors_1[3], array4) + + merge_ents = nlp.create_pipe("merge_entities") + nlp.add_pipe(merge_ents) + + parsed_vectors_2 = [t.vector for t in nlp(text)] + assert len(parsed_vectors_2) == 3 + np.testing.assert_array_equal(parsed_vectors_2[0], array1) + np.testing.assert_array_equal(parsed_vectors_2[1], array2) + np.testing.assert_array_equal(parsed_vectors_2[2], array34) diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index a5d06491a..512ad73bc 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -213,6 +213,10 @@ def _merge(Doc doc, merges): new_orth = ''.join([t.text_with_ws for t in spans[token_index]]) if spans[token_index][-1].whitespace_: new_orth = new_orth[:-len(spans[token_index][-1].whitespace_)] + # add the vector of the (merged) entity to the vocab + if not doc.vocab.get_vector(new_orth).any(): + if doc.vocab.vectors_length > 0: + doc.vocab.set_vector(new_orth, span.vector) token = tokens[token_index] lex = doc.vocab.get(doc.mem, new_orth) token.lex = lex From 993758c58fba9d4611223f5dd6dcdb203cf67bba Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Sun, 8 Mar 2020 13:22:25 +0100 Subject: [PATCH 
17/46] Remove unnecessary iterator in Language.pipe (#5101) Remove iterator over `raw_texts` with `iterator.tee()` in `Language.pipe` that is never consumed and consumes memory unnecessarily. --- spacy/language.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 28fddfebb..f0928b1f9 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -757,8 +757,6 @@ class Language(object): DOCS: https://spacy.io/api/language#pipe """ - # raw_texts will be used later to stop iterator. - texts, raw_texts = itertools.tee(texts) if is_python2 and n_process != 1: user_warning(Warnings.W023) n_process = 1 From 9dd98a4b2759f5231fcc3b2a09d16f27b79ab13b Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Sun, 8 Mar 2020 13:24:19 +0100 Subject: [PATCH 18/46] Improve Makefile (#5105) * Explicitly upgrade pip * Include spacy-lookups-data in pex --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 13c9026b7..cf96d6294 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ VENV := ./env$(PYVER) version := $(shell "bin/get-version.sh") dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp - $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema + $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy_lookups_data chmod a+rx $@ dist/pytest.pex : wheelhouse/pytest-*.whl @@ -22,7 +22,7 @@ wheelhouse/pytest-%.whl : $(VENV)/bin/pex $(VENV)/bin/pex : python$(PYVER) -m venv $(VENV) - $(VENV)/bin/python -m pip install pex wheel + $(VENV)/bin/pip install -U pip setuptools pex wheel .PHONY : clean test From 31755630a7b33bc9c621c1e82cc0c09da84720d4 Mon Sep 17 00:00:00 2001 From: Yohei Tamura Date: Sun, 8 Mar 2020 21:24:38 +0900 Subject: [PATCH 19/46] fix typ (#5106) --- bin/wiki_entity_linking/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/wiki_entity_linking/README.md b/bin/wiki_entity_linking/README.md index 56d0c1415..4e4af5c21 100644 --- a/bin/wiki_entity_linking/README.md +++ b/bin/wiki_entity_linking/README.md @@ -2,7 +2,7 @@ ### Step 1: Create a Knowledge Base (KB) and training data -Run `wikipedia_pretrain_kb.py` +Run `wikidata_pretrain_kb.py` * This takes as input the locations of a **Wikipedia and a Wikidata dump**, and produces a **KB directory** + **training file** * WikiData: get `latest-all.json.bz2` from https://dumps.wikimedia.org/wikidatawiki/entities/ * Wikipedia: get `enwiki-latest-pages-articles-multistream.xml.bz2` from https://dumps.wikimedia.org/enwiki/latest/ (or for any other language) From 0345135167c882575e006bf434c9f8d8d81f9e12 Mon Sep 17 00:00:00 2001 From: Mark Abraham Date: Sun, 8 Mar 2020 13:25:56 +0100 Subject: [PATCH 20/46] Tokenizer to_disk and from_disk now ensure paths (#5116) * Tokenizer to_disk and from_disk now ensure strings are converted to paths Fixes #5115 * Sign contributor agreement --- .github/contributors/mabraham.md | 89 ++++++++++++++++++++++++++++++++ spacy/tokenizer.pyx | 2 + 2 files changed, 91 insertions(+) create mode 100644 .github/contributors/mabraham.md diff --git a/.github/contributors/mabraham.md b/.github/contributors/mabraham.md new file mode 100644 index 000000000..c91c950a3 --- /dev/null +++ b/.github/contributors/mabraham.md @@ -0,0 +1,89 @@ + + +## Contributor Agreement + +1. 
The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | | +| GitHub username | | +| Website (optional) | | \ No newline at end of file diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 12c7b73af..4da081259 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -508,6 +508,7 @@ cdef class Tokenizer: DOCS: https://spacy.io/api/tokenizer#to_disk """ + path = util.ensure_path(path) with path.open("wb") as file_: file_.write(self.to_bytes(**kwargs)) @@ -521,6 +522,7 @@ cdef class Tokenizer: DOCS: https://spacy.io/api/tokenizer#from_disk """ + path = util.ensure_path(path) with path.open("rb") as file_: bytes_data = file_.read() self.from_bytes(bytes_data, **kwargs) From 5f680042647ef7d0c71a5041f33558bf81e656d8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 9 Mar 2020 11:05:00 +0100 Subject: [PATCH 21/46] Port over gitignore changes from develop Prevents stale files when switching branches --- .gitignore | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gitignore b/.gitignore index 828258603..edcbba4d5 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,11 @@ corpora/ keys/ *.json.gz +# Tests +spacy/tests/package/setup.cfg +spacy/tests/package/pyproject.toml +spacy/tests/package/requirements.txt + # Website website/.cache/ website/public/ From 1d6aec805d5c03ad8a039466e98ed3a619e650c4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 9 Mar 2020 11:17:20 +0100 Subject: [PATCH 22/46] Fix formatting and update docs for v2.2.4 --- spacy/cli/debug_data.py | 25 ++++++++++++++++--------- website/docs/api/cli.md | 30 ++++++++++++++++++++---------- website/docs/api/doc.md | 22 ++++++++++++---------- website/docs/api/span.md | 30 ++++++++++++++++++++++++++---- website/docs/api/top-level.md | 32 ++++++++++++++++---------------- website/meta/languages.json | 2 ++ 6 files changed, 92 insertions(+), 49 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 0e12a594c..c5e1ff6cf 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -23,20 +23,17 @@ BLANK_MODEL_THRESHOLD = 2000 @plac.annotations( + # fmt: off lang=("model language", "positional", None, str), train_path=("location of JSON-formatted training data", "positional", None, Path), dev_path=("location of JSON-formatted development data", "positional", None, Path), tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path), base_model=("name of model to update (optional)", "option", "b", str), - pipeline=( - "Comma-separated names of pipeline components to train", - "option", - "p", - str, - ), + pipeline=("Comma-separated names of pipeline components to train", "option", "p", str), ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool), verbose=("Print additional information and explanations", "flag", "V", bool), no_format=("Don't pretty-print the results", "flag", "NF", bool), + # fmt: on ) def debug_data( lang, @@ -235,13 +232,17 @@ def debug_data( if gold_train_data["ws_ents"]: msg.fail( - "{} invalid whitespace entity 
span(s)".format(gold_train_data["ws_ents"]) + "{} invalid whitespace entity span(s)".format( + gold_train_data["ws_ents"] + ) ) has_ws_ents_error = True if gold_train_data["punct_ents"]: msg.warn( - "{} entity span(s) with punctuation".format(gold_train_data["punct_ents"]) + "{} entity span(s) with punctuation".format( + gold_train_data["punct_ents"] + ) ) has_punct_ents_warning = True @@ -592,7 +593,13 @@ def _compile_gold(train_docs, pipeline): if label.startswith(("B-", "U-", "L-")) and doc[i].is_space: # "Illegal" whitespace entity data["ws_ents"] += 1 - if label.startswith(("B-", "U-", "L-")) and doc[i].text in [".", "'", "!", "?", ","]: + if label.startswith(("B-", "U-", "L-")) and doc[i].text in [ + ".", + "'", + "!", + "?", + ",", + ]: # punctuation entity: could be replaced by whitespace when training with noise, # so add a warning to alert the user to this unexpected side effect. data["punct_ents"] += 1 diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 2f7346491..e47695efb 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -184,16 +184,17 @@ low data labels and more. $ python -m spacy debug-data [lang] [train_path] [dev_path] [--base-model] [--pipeline] [--ignore-warnings] [--verbose] [--no-format] ``` -| Argument | Type | Description | -| -------------------------- | ---------- | -------------------------------------------------------------------------------------------------- | -| `lang` | positional | Model language. | -| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. | -| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. | -| `--base-model`, `-b` | option | Optional name of base model to update. Can be any loadable spaCy model. | -| `--pipeline`, `-p` | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. | -| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. | -| `--verbose`, `-V` | flag | Print additional information and explanations. | -| --no-format, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. | +| Argument | Type | Description | +| ------------------------------------------------------ | ---------- | -------------------------------------------------------------------------------------------------- | +| `lang` | positional | Model language. | +| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. | +| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. | +| `--tag-map-path`, `-tm` 2.2.3 | option | Location of JSON-formatted tag map. | +| `--base-model`, `-b` | option | Optional name of base model to update. Can be any loadable spaCy model. | +| `--pipeline`, `-p` | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. | +| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. | +| `--verbose`, `-V` | flag | Print additional information and explanations. | +| --no-format, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. | @@ -368,6 +369,7 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path] | `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. 
| | `--base-model`, `-b` 2.1 | option | Optional name of base model to update. Can be any loadable spaCy model. | | `--pipeline`, `-p` 2.1 | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. | +| `--replace-components`, `-R` | flag | Replace components from the base model. | | `--vectors`, `-v` | option | Model to load vectors from. | | `--n-iter`, `-n` | option | Number of iterations (default: `30`). | | `--n-early-stopping`, `-ne` | option | Maximum number of training epochs without dev accuracy improvement. | @@ -378,6 +380,13 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path] | `--init-tok2vec`, `-t2v` 2.1 | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental. | | `--parser-multitasks`, `-pt` | option | Side objectives for parser CNN, e.g. `'dep'` or `'dep,tag'` | | `--entity-multitasks`, `-et` | option | Side objectives for NER CNN, e.g. `'dep'` or `'dep,tag'` | +| `--width`, `-cw` 2.2.4 | option | Width of CNN layers of `Tok2Vec` component. | +| `--conv-depth`, `-cd` 2.2.4 | option | Depth of CNN layers of `Tok2Vec` component. | +| `--cnn-window`, `-cW` 2.2.4 | option | Window size for CNN layers of `Tok2Vec` component. | +| `--cnn-pieces`, `-cP` 2.2.4 | option | Maxout size for CNN layers of `Tok2Vec` component. | +| `--use-chars`, `-chr` 2.2.4 | flag | Whether to use character-based embedding of `Tok2Vec` component. | +| `--bilstm-depth`, `-lstm` 2.2.4 | option | Depth of BiLSTM layers of `Tok2Vec` component (requires PyTorch). | +| `--embed-rows`, `-er` 2.2.4 | option | Number of embedding rows of `Tok2Vec` component. | | `--noise-level`, `-nl` | option | Float indicating the amount of corruption for data augmentation. | | `--orth-variant-level`, `-ovl` 2.2 | option | Float indicating the orthography variation for data augmentation (e.g. `0.3` for making 30% of occurrences of some tokens subject to replacement). | | `--gold-preproc`, `-G` | flag | Use gold preprocessing. | @@ -385,6 +394,7 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path] | `--textcat-multilabel`, `-TML` 2.2 | flag | Text classification classes aren't mutually exclusive (multilabel). | | `--textcat-arch`, `-ta` 2.2 | option | Text classification model architecture. Defaults to `"bow"`. | | `--textcat-positive-label`, `-tpl` 2.2 | option | Text classification positive label for binary classes with two labels. | +| `--tag-map-path`, `-tm` 2.2.4 | option | Location of JSON-formatted tag map. | | `--verbose`, `-VV` 2.0.13 | flag | Show more detailed messages during training. | | `--help`, `-h` | flag | Show help message and available arguments. | | **CREATES** | model, pickle | A spaCy model on each epoch. | diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 4f948e425..87b854a8c 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -7,9 +7,10 @@ source: spacy/tokens/doc.pyx A `Doc` is a sequence of [`Token`](/api/token) objects. Access sentences and named entities, export annotations to numpy arrays, losslessly serialize to -compressed binary strings. The `Doc` object holds an array of [`TokenC`](/api/cython-structs#tokenc) structs. -The Python-level `Token` and [`Span`](/api/span) objects are views of this -array, i.e. they don't own the data themselves. +compressed binary strings. The `Doc` object holds an array of +[`TokenC`](/api/cython-structs#tokenc) structs. 
The Python-level `Token` and +[`Span`](/api/span) objects are views of this array, i.e. they don't own the +data themselves. ## Doc.\_\_init\_\_ {#init tag="method"} @@ -197,13 +198,14 @@ the character indices don't map to a valid span. > assert span.text == "New York" > ``` -| Name | Type | Description | -| ----------- | ---------------------------------------- | ------------------------------------------------------- | -| `start` | int | The index of the first character of the span. | -| `end` | int | The index of the last character after the span. | -| `label` | uint64 / unicode | A label to attach to the Span, e.g. for named entities. | -| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | -| **RETURNS** | `Span` | The newly constructed object or `None`. | +| Name | Type | Description | +| ------------------------------------ | ---------------------------------------- | --------------------------------------------------------------------- | +| `start` | int | The index of the first character of the span. | +| `end` | int | The index of the last character after the span. | +| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. | +| `kb_id` 2.2 | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. | +| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | +| **RETURNS** | `Span` | The newly constructed object or `None`. | ## Doc.similarity {#similarity tag="method" model="vectors"} diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 64b77b89d..3833bbca9 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -172,6 +172,28 @@ Remove a previously registered extension. | `name` | unicode | Name of the extension. | | **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. | +## Span.char_span {#char_span tag="method" new="2.2.4"} + +Create a `Span` object from the slice `span.text[start:end]`. Returns `None` if +the character indices don't map to a valid span. + +> #### Example +> +> ```python +> doc = nlp("I like New York") +> span = doc[1:4].char_span(5, 13, label="GPE") +> assert span.text == "New York" +> ``` + +| Name | Type | Description | +| ----------- | ---------------------------------------- | --------------------------------------------------------------------- | +| `start` | int | The index of the first character of the span. | +| `end` | int | The index of the last character after the span. | +| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. | +| `kb_id` | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. | +| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | +| **RETURNS** | `Span` | The newly constructed object or `None`. | + ## Span.similarity {#similarity tag="method" model="vectors"} Make a semantic similarity estimate. The default estimate is cosine similarity @@ -293,10 +315,10 @@ Create a new `Doc` object corresponding to the `Span`, with a copy of the data. > assert doc2.text == "New York" > ``` -| Name | Type | Description | -| ----------------- | ----- | ---------------------------------------------------- | -| `copy_user_data` | bool | Whether or not to copy the original doc's user data. | -| **RETURNS** | `Doc` | A `Doc` object of the `Span`'s content. 
| +| Name | Type | Description | +| ---------------- | ----- | ---------------------------------------------------- | +| `copy_user_data` | bool | Whether or not to copy the original doc's user data. | +| **RETURNS** | `Doc` | A `Doc` object of the `Span`'s content. | ## Span.root {#root tag="property" model="parser"} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 266df87f0..217c51794 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -236,22 +236,22 @@ If a setting is not present in the options, the default value will be used. > displacy.serve(doc, style="dep", options=options) > ``` -| Name | Type | Description | Default | -| ------------------ | ------- | --------------------------------------------------------------------------------------------------------------- | ----------------------- | -| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` | -| `add_lemma` | bool | Print the lemma's in a separate row below the token texts in the `dep` visualisation. | `False` | -| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` | -| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` | -| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` | -| `color` | unicode | Text color (HEX, RGB or color names). | `'#000000'` | -| `bg` | unicode | Background color (HEX, RGB or color names). | `'#ffffff'` | -| `font` | unicode | Font name or font family for all text. | `'Arial'` | -| `offset_x` | int | Spacing on left side of the SVG in px. | `50` | -| `arrow_stroke` | int | Width of arrow path in px. | `2` | -| `arrow_width` | int | Width of arrow head in px. | `10` / `8` (compact) | -| `arrow_spacing` | int | Spacing between arrows in px to avoid overlaps. | `20` / `12` (compact) | -| `word_spacing` | int | Vertical spacing between words and arcs in px. | `45` | -| `distance` | int | Distance between words in px. | `175` / `150` (compact) | +| Name | Type | Description | Default | +| ------------------------------------------ | ------- | --------------------------------------------------------------------------------------------------------------- | ----------------------- | +| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` | +| `add_lemma` 2.2.4 | bool | Print the lemma's in a separate row below the token texts. | `False` | +| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` | +| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` | +| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` | +| `color` | unicode | Text color (HEX, RGB or color names). | `'#000000'` | +| `bg` | unicode | Background color (HEX, RGB or color names). | `'#ffffff'` | +| `font` | unicode | Font name or font family for all text. | `'Arial'` | +| `offset_x` | int | Spacing on left side of the SVG in px. | `50` | +| `arrow_stroke` | int | Width of arrow path in px. | `2` | +| `arrow_width` | int | Width of arrow head in px. | `10` / `8` (compact) | +| `arrow_spacing` | int | Spacing between arrows in px to avoid overlaps. 
| `20` / `12` (compact) | +| `word_spacing` | int | Vertical spacing between words and arcs in px. | `45` | +| `distance` | int | Distance between words in px. | `175` / `150` (compact) | #### Named Entity Visualizer options {#displacy_options-ent} diff --git a/website/meta/languages.json b/website/meta/languages.json index c22ddad69..8834aaddc 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -95,6 +95,8 @@ "has_examples": true }, { "code": "hr", "name": "Croatian", "has_examples": true }, + { "code": "eu", "name": "Basque", "has_examples": true }, + { "code": "yo", "name": "Yoruba", "has_examples": true }, { "code": "tr", "name": "Turkish", "example": "Bu bir cümledir.", "has_examples": true }, { "code": "ca", "name": "Catalan", "example": "Això és una frase.", "has_examples": true }, { "code": "he", "name": "Hebrew", "example": "זהו משפט.", "has_examples": true }, From eccf6b16866defc66db9869603e9597a4ecb82b5 Mon Sep 17 00:00:00 2001 From: Renaud Richardet Date: Mon, 9 Mar 2020 14:49:11 +0100 Subject: [PATCH 23/46] small typo in code sample --- website/docs/usage/rule-based-matching.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index f8866aec1..0ab74034e 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -1119,7 +1119,7 @@ entityruler = EntityRuler(nlp) patterns = [{"label": "TEST", "pattern": str(i)} for i in range(100000)] other_pipes = [p for p in nlp.pipe_names if p != "tagger"] -with nlp.disable_pipes(*disable_pipes): +with nlp.disable_pipes(*other_pipes): entityruler.add_patterns(patterns) ``` From 1724a4f75b3a1ee5ceec39bbaf14b82051c11e90 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 9 Mar 2020 18:08:18 +0100 Subject: [PATCH 24/46] additional information if doc is empty --- spacy/tests/matcher/test_matcher_api.py | 9 ++++++++- spacy/tokens/doc.pyx | 2 +- website/docs/api/doc.md | 8 ++++---- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index a826a0a0e..74d4b8b00 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -5,7 +5,7 @@ import pytest import re from mock import Mock from spacy.matcher import Matcher, DependencyMatcher -from spacy.tokens import Doc, Token +from spacy.tokens import Doc, Token, Span from ..doc.test_underscore import clean_underscore @@ -458,3 +458,10 @@ def test_matcher_callback(en_vocab): doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) matches = matcher(doc) mock.assert_called_once_with(matcher, doc, 0, matches) + +def test_matcher_span(matcher): + text = "JavaScript is good but Java is better" + doc = Doc(matcher.vocab, words=text.split()) + span = Span(doc, 0, 3) + matches = matcher(span.as_doc()) + assert len(matches) == 1 \ No newline at end of file diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 0c90929c3..ec0cd66b8 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -260,7 +260,7 @@ cdef class Doc: def is_nered(self): """Check if the document has named entities set. Will return True if *any* of the tokens has a named entity tag set (even if the others are - unknown values). + unknown values), or if the document is empty. 
""" if len(self) == 0: return True diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 87b854a8c..ab85c1deb 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -657,10 +657,10 @@ The L2 norm of the document's vector representation. | `user_data` | - | A generic storage area, for user custom data. | | `lang` 2.1 | int | Language of the document's vocabulary. | | `lang_` 2.1 | unicode | Language of the document's vocabulary. | -| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. | -| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. | -| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. | -| `is_nered` 2.1 | bool | A flag indicating that named entities have been set. Will return `True` if _any_ of the tokens has an entity tag set, even if the others are unknown. | +| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. | +| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. | +| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. | +| `is_nered` 2.1 | bool | A flag indicating that named entities have been set. Will return `True` if the `Doc` is empty, or if _any_ of the tokens has an entity tag set, even if the others are unknown. | | `sentiment` | float | The document's positivity/negativity score, if available. | | `user_hooks` | dict | A dictionary that allows customization of the `Doc`'s properties. | | `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. 
| From c4d030dbf68990e7af6b6a87d6add829906806bf Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 9 Mar 2020 18:10:54 +0100 Subject: [PATCH 25/46] remove accidental commit --- spacy/tests/matcher/test_matcher_api.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 74d4b8b00..a826a0a0e 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -5,7 +5,7 @@ import pytest import re from mock import Mock from spacy.matcher import Matcher, DependencyMatcher -from spacy.tokens import Doc, Token, Span +from spacy.tokens import Doc, Token from ..doc.test_underscore import clean_underscore @@ -458,10 +458,3 @@ def test_matcher_callback(en_vocab): doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) matches = matcher(doc) mock.assert_called_once_with(matcher, doc, 0, matches) - -def test_matcher_span(matcher): - text = "JavaScript is good but Java is better" - doc = Doc(matcher.vocab, words=text.split()) - span = Span(doc, 0, 3) - matches = matcher(span.as_doc()) - assert len(matches) == 1 \ No newline at end of file From ba47d5a5cb29297c653af1543f9dff9039dab449 Mon Sep 17 00:00:00 2001 From: Himanshu Garg <35988194+merrcury@users.noreply.github.com> Date: Tue, 10 Mar 2020 15:03:29 +0530 Subject: [PATCH 26/46] Update LICENSE Year --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index 11221f687..87b814ce4 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (C) 2016-2019 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal +Copyright (C) 2016-2020 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From 27d1300bdb23afc407500f30ae5071889b2cf6de Mon Sep 17 00:00:00 2001 From: Himanshu Garg <35988194+merrcury@users.noreply.github.com> Date: Tue, 10 Mar 2020 15:11:07 +0530 Subject: [PATCH 27/46] Create merrcury.md --- .github/contributors/merrcury.md | 106 +++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/merrcury.md diff --git a/.github/contributors/merrcury.md b/.github/contributors/merrcury.md new file mode 100644 index 000000000..056a790eb --- /dev/null +++ b/.github/contributors/merrcury.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. 
+ +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [X] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Himanshu Garg | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-03-10 | +| GitHub username | merrcury | +| Website (optional) | | From 26a90f011b8c21dfc06940579479aaff8006ff74 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 12 Mar 2020 11:30:41 +0100 Subject: [PATCH 28/46] Set version to v2.2.4 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 365c2adbb..84dc86aa8 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "2.2.4.dev0" +__version__ = "2.2.4" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 423849f94a09cb5979e5bb7953c576d6e50b1b3c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 13 Mar 2020 09:25:23 +0100 Subject: [PATCH 29/46] Fix sents comparison in test util Due to changes to `Span` (#5005), spans from different documents are now never equal. Check `Token.is_sent_start` values instead. --- spacy/tests/util.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/tests/util.py b/spacy/tests/util.py index 52768dd41..a0d6273a9 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -116,8 +116,7 @@ def assert_docs_equal(doc1, doc2): assert [t.head.i for t in doc1] == [t.head.i for t in doc2] assert [t.dep for t in doc1] == [t.dep for t in doc2] - if doc1.is_parsed and doc2.is_parsed: - assert [s for s in doc1.sents] == [s for s in doc2.sents] + assert [t.is_sent_start for t in doc1] == [t.is_sent_start for t in doc2] assert [t.ent_type for t in doc1] == [t.ent_type for t in doc2] assert [t.ent_iob for t in doc1] == [t.ent_iob for t in doc2] From a0ffa346c0371c6f2fd7c5ae7e9f5a26e36bfc76 Mon Sep 17 00:00:00 2001 From: Mark Abraham Date: Fri, 13 Mar 2020 14:07:26 +0100 Subject: [PATCH 30/46] Fix broken link in docs --- website/docs/usage/saving-loading.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index 70983198f..8e2c30d82 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -94,7 +94,7 @@ docs = list(doc_bin.get_docs(nlp.vocab)) If `store_user_data` is set to `True`, the `Doc.user_data` will be serialized as well, which includes the values of -[extension attributes](/processing-pipelines#custom-components-attributes) (if +[extension attributes](/usage/processing-pipelines#custom-components-attributes) (if they're serializable with msgpack). 
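
The `saving-loading.md` passage touched by the patch above describes how `DocBin(store_user_data=True)` also serializes `Doc.user_data`, which is where msgpack-serializable extension attribute values live. A minimal sketch of that round trip, assuming the spaCy v2.2 `DocBin` API; the `sentiment_label` extension name is only an illustrative placeholder and not part of the patch:

```python
import spacy
from spacy.tokens import Doc, DocBin

# Register a custom extension; its value is stored in Doc.user_data.
Doc.set_extension("sentiment_label", default=None)

nlp = spacy.blank("en")
doc = nlp("This is a test.")
doc._.sentiment_label = "positive"  # msgpack-serializable value

# Serialize with user data included, then restore it elsewhere.
doc_bin = DocBin(store_user_data=True)
doc_bin.add(doc)
bytes_data = doc_bin.to_bytes()

restored = list(
    DocBin(store_user_data=True).from_bytes(bytes_data).get_docs(nlp.vocab)
)
assert restored[0]._.sentiment_label == "positive"
```

Note that the receiving `DocBin` must also be created with `store_user_data=True`, otherwise the stored `user_data` (and with it the extension values) is not applied to the reconstructed docs.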
From 9cde7eb08c6f06683f0a0085835f8909ce2c56fe Mon Sep 17 00:00:00 2001 From: nihil <666@nabovarme.dk> Date: Fri, 13 Mar 2020 17:58:29 +0100 Subject: [PATCH 31/46] add spacy_syllables to universe + sign contributor agreement --- .github/contributors/sloev.md | 106 ++++++++++++++++++++++++++++++++++ website/meta/universe.json | 35 +++++++++++ 2 files changed, 141 insertions(+) create mode 100644 .github/contributors/sloev.md diff --git a/.github/contributors/sloev.md b/.github/contributors/sloev.md new file mode 100644 index 000000000..d151d4606 --- /dev/null +++ b/.github/contributors/sloev.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------ | +| Name | Johannes Valbjørn | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-03-13 | +| GitHub username | sloev | +| Website (optional) | https://sloev.github.io | diff --git a/website/meta/universe.json b/website/meta/universe.json index 0ff622521..91361e234 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1965,6 +1965,41 @@ }, "category": ["pipeline"], "tags": ["phrase extraction", "ner", "summarization", "graph algorithms", "textrank"] + }, + { + "id": "spacy_syllables", + "title": "Spacy Syllables", + "slogan": "Multilingual syllable annotations", + "description": "Spacy Syllables is a pipeline component that adds multilingual syllable annotations to Tokens. 
It uses Pyphen under the hood and has support for a long list of languages.", + "github": "sloev/spacy-syllables", + "pip": "spacy_syllables", + "code_example": [ + "import spacy", + "from spacy_syllables import SpacySyllables", + "", + "nlp = spacy.load('en_core_web_sm')", + "syllables = SpacySyllables(nlp)", + "nlp.add_pipe(syllables, after='tagger')", + "", + "doc = nlp('terribly long')", + "", + "data = [", + " (token.text, token._.syllables, token._.syllables_count)", + " for token in doc", + "]", + "", + "assert data == [", + " ('terribly', ['ter', 'ri', 'bly'], 3),", + " ('long', ['long'], 1)", + "]" + ], + "thumb": "https://raw.githubusercontent.com/sloev/spacy-syllables/master/logo.png", + "author": "Johannes Valbjørn", + "author_links": { + "github": "sloev" + }, + "category": ["pipeline"], + "tags": ["syllables", "multilingual"] } ], From 36e35324759482744e97265e8b89768f4311cb1a Mon Sep 17 00:00:00 2001 From: Alan Chan Date: Sun, 15 Mar 2020 02:06:32 +0800 Subject: [PATCH 32/46] Remove unfinished sentence --- website/docs/usage/adding-languages.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/website/docs/usage/adding-languages.md b/website/docs/usage/adding-languages.md index 4b12c6be1..70411ec0b 100644 --- a/website/docs/usage/adding-languages.md +++ b/website/docs/usage/adding-languages.md @@ -622,13 +622,13 @@ categorizer is to use the [`spacy train`](/api/cli#train) command-line utility. In order to use this, you'll need training and evaluation data in the [JSON format](/api/annotation#json-input) spaCy expects for training. -You can now train the model using a corpus for your language annotated with If -your data is in one of the supported formats, the easiest solution might be to -use the [`spacy convert`](/api/cli#convert) command-line utility. This supports -several popular formats, including the IOB format for named entity recognition, -the JSONL format produced by our annotation tool [Prodigy](https://prodi.gy), -and the [CoNLL-U](http://universaldependencies.org/docs/format.html) format used -by the [Universal Dependencies](http://universaldependencies.org/) corpus. +If your data is in one of the supported formats, the easiest solution might be +to use the [`spacy convert`](/api/cli#convert) command-line utility. This +supports several popular formats, including the IOB format for named entity +recognition, the JSONL format produced by our annotation tool +[Prodigy](https://prodi.gy), and the +[CoNLL-U](http://universaldependencies.org/docs/format.html) format used by the +[Universal Dependencies](http://universaldependencies.org/) corpus. One thing to keep in mind is that spaCy expects to train its models from **whole documents**, not just single sentences. If your corpus only contains single From 7c3a4ce933edfe4084005a65e07373e47a9d48cb Mon Sep 17 00:00:00 2001 From: Alan Chan Date: Sun, 15 Mar 2020 03:11:17 +0800 Subject: [PATCH 33/46] Missing word in api/cli doc --- website/docs/api/cli.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index e47695efb..28dc332ba 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -109,9 +109,9 @@ links) and check whether they are compatible with the currently installed version of spaCy. Should be run after upgrading spaCy via `pip install -U spacy` to ensure that all installed models are can be used with the new version. 
The command is also useful to detect out-of-sync model links resulting from links -created in different virtual environments. It will a list of models, the -installed versions, the latest compatible version (if out of date) and the -commands for updating. +created in different virtual environments. It will show a list of models and +their installed versions. If any model is out of date, the latest compatible +versions and command for updating are shown. > #### Automated validation > From 2124be100da49b828ce315aa802c79448536fa2b Mon Sep 17 00:00:00 2001 From: Alan Chan Date: Sun, 15 Mar 2020 03:14:51 +0800 Subject: [PATCH 34/46] Tweak run-on sentence --- website/docs/api/cli.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 28dc332ba..f067ba5a7 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -176,7 +176,7 @@ All output files generated by this command are compatible with ## Debug data {#debug-data new="2.2"} -Analyze, debug and validate your training and development data, get useful +Analyze, debug, and validate your training and development data. Get useful stats, and find problems like invalid entity annotations, cyclic dependencies, low data labels and more. From 1ae01684cfa3d0530e687c8d4bcf3cbd44926030 Mon Sep 17 00:00:00 2001 From: Alan Chan Date: Sun, 15 Mar 2020 03:24:51 +0800 Subject: [PATCH 35/46] Fill in contributor agreement --- .github/contributors/pinealan.md | 106 +++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/pinealan.md diff --git a/.github/contributors/pinealan.md b/.github/contributors/pinealan.md new file mode 100644 index 000000000..699b405e2 --- /dev/null +++ b/.github/contributors/pinealan.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Alan Chan | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-03-15 | +| GitHub username | pinealan | +| Website (optional) | http://pinealan.xyz | From d2ffb406adf5ddcf68fdd6290c1a556517857392 Mon Sep 17 00:00:00 2001 From: Peter B <5107405+pmbaumgartner@users.noreply.github.com> Date: Tue, 17 Mar 2020 08:30:29 -0400 Subject: [PATCH 36/46] =?UTF-8?q?add=20gobbli=20to=20spacy-universe=20?= =?UTF-8?q?=F0=9F=A5=B3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- website/meta/universe.json | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 91361e234..9138f8819 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -2000,6 +2000,44 @@ }, "category": ["pipeline"], "tags": ["syllables", "multilingual"] + }, + { + "id": "gobbli", + "title": "gobbli", + "slogan": "Deep learning for text classification doesn't have to be scary", + "description": "gobbli is a Python library which wraps several modern deep learning models in a uniform interface that makes it easy to evaluate feasibility and conduct analyses. It leverages the abstractive powers of Docker to hide nearly all dependency management and functional differences between models from the user. It also contains an interactive app for exploring text data and evaluating classification models.", + "url": "https://github.com/rtiinternational/gobbli", + "github": "rtiinternational/gobbli", + "pip": "gobbli", + "thumb": "https://i.postimg.cc/NGpzhrdr/gobbli-lg.png", + "code_example": [ + "from gobbli.io import PredictInput, TrainInput", + "from gobbli.model.bert import BERT", + "", + "train_input = TrainInput(", + " X_train=['This is a training document.', 'This is another training document.'],", + " y_train=['0', '1'],", + " X_valid=['This is a validation sentence.', 'This is another validation sentence.'],", + " y_valid=['1', '0'],", + ")", + "", + "clf = BERT()", + "", + "# Set up classifier resources -- Docker image, etc.", + "clf.build()", + "", + "# Train model", + "train_output = clf.train(train_input)", + "", + "predict_input = PredictInput(", + " X=['Which class is this document?'],", + " labels=train_output.labels,", + " checkpoint=train_output.checkpoint,", + ")", + "", + "predict_output = clf.predict(predict_input)" + ], + "category": ["standalone"] } ], From b2b01a5c8bfd90a78f4c15e75c5cd60122389bb0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 17 Mar 2020 19:53:31 +0100 Subject: [PATCH 37/46] Update universe.json [ci skip] --- website/meta/universe.json | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 91361e234..56f4f31a3 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,5 +1,32 @@ { "resources": [ + { + "id": "spacy-stanza", + "title": "spacy-stanza", + "slogan": "Use the latest Stanza (StanfordNLP) research models directly in spaCy", + "description": "This package wraps the Stanza (formerly StanfordNLP) library, so you can use Stanford's models as a spaCy pipeline. 
Using this wrapper, you'll be able to use the following annotations, computed by your pretrained `stanza` model:\n\n- Statistical tokenization (reflected in the `Doc` and its tokens)\n - Lemmatization (`token.lemma` and `token.lemma_`)\n - Part-of-speech tagging (`token.tag`, `token.tag_`, `token.pos`, `token.pos_`)\n - Dependency parsing (`token.dep`, `token.dep_`, `token.head`)\n - Named entity recognition (`doc.ents`, `token.ent_type`, `token.ent_type_`, `token.ent_iob`, `token.ent_iob_`)\n - Sentence segmentation (`doc.sents`)", + "github": "explosion/spacy-stanza", + "thumb": "https://i.imgur.com/myhLjMJ.png", + "code_example": [ + "import stanza", + "from spacy_stanza import StanzaLanguage", + "", + "snlp = stanza.Pipeline(lang=\"en\")", + "nlp = StanzaLanguage(snlp)", + "", + "doc = nlp(\"Barack Obama was born in Hawaii. He was elected president in 2008.\")", + "for token in doc:", + " print(token.text, token.lemma_, token.pos_, token.dep_, token.ent_type_)", + "print(doc.ents)" + ], + "category": ["pipeline", "standalone", "models", "research"], + "author": "Explosion", + "author_links": { + "twitter": "explosion_ai", + "github": "explosion", + "website": "https://explosion.ai" + } + }, { "id": "spacy-server", "title": "spaCy Server", From b04057c204882da80b7475a2fe78fa0f62b929a0 Mon Sep 17 00:00:00 2001 From: Peter B <5107405+pmbaumgartner@users.noreply.github.com> Date: Tue, 17 Mar 2020 15:03:43 -0400 Subject: [PATCH 38/46] add mentions of spaCy use --- website/meta/universe.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 9138f8819..c27f1b468 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -2005,7 +2005,7 @@ "id": "gobbli", "title": "gobbli", "slogan": "Deep learning for text classification doesn't have to be scary", - "description": "gobbli is a Python library which wraps several modern deep learning models in a uniform interface that makes it easy to evaluate feasibility and conduct analyses. It leverages the abstractive powers of Docker to hide nearly all dependency management and functional differences between models from the user. It also contains an interactive app for exploring text data and evaluating classification models.", + "description": "gobbli is a Python library which wraps several modern deep learning models in a uniform interface that makes it easy to evaluate feasibility and conduct analyses. It leverages the abstractive powers of Docker to hide nearly all dependency management and functional differences between models from the user. It also contains an interactive app for exploring text data and evaluating classification models. SpaCy's base text classification models, as well as models integrated from `spacy-transformers`, are available in the collection of classification models available. 
In addition, spaCy is used for data augmentation and document embeddings.", "url": "https://github.com/rtiinternational/gobbli", "github": "rtiinternational/gobbli", "pip": "gobbli", From eda6eff8b10d9800199b160350e6d6f9d40521ca Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 17 Mar 2020 22:19:29 +0100 Subject: [PATCH 39/46] Update universe.json [ci skip] --- website/meta/universe.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index d7c458c36..a1ae388a2 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -2032,7 +2032,7 @@ "id": "gobbli", "title": "gobbli", "slogan": "Deep learning for text classification doesn't have to be scary", - "description": "gobbli is a Python library which wraps several modern deep learning models in a uniform interface that makes it easy to evaluate feasibility and conduct analyses. It leverages the abstractive powers of Docker to hide nearly all dependency management and functional differences between models from the user. It also contains an interactive app for exploring text data and evaluating classification models. SpaCy's base text classification models, as well as models integrated from `spacy-transformers`, are available in the collection of classification models available. In addition, spaCy is used for data augmentation and document embeddings.", + "description": "gobbli is a Python library which wraps several modern deep learning models in a uniform interface that makes it easy to evaluate feasibility and conduct analyses. It leverages the abstractive powers of Docker to hide nearly all dependency management and functional differences between models from the user. It also contains an interactive app for exploring text data and evaluating classification models. spaCy's base text classification models, as well as models integrated from `spacy-transformers`, are available in the collection of classification models. In addition, spaCy is used for data augmentation and document embeddings.", "url": "https://github.com/rtiinternational/gobbli", "github": "rtiinternational/gobbli", "pip": "gobbli", From 80e7e1347eb59b739503184fa4d69814f6f07954 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 17 Mar 2020 22:21:34 +0100 Subject: [PATCH 40/46] Update universe.json [ci skip] --- website/meta/universe.json | 1 + 1 file changed, 1 insertion(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index a1ae388a2..23d052bb9 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -6,6 +6,7 @@ "slogan": "Use the latest Stanza (StanfordNLP) research models directly in spaCy", "description": "This package wraps the Stanza (formerly StanfordNLP) library, so you can use Stanford's models as a spaCy pipeline. 
Using this wrapper, you'll be able to use the following annotations, computed by your pretrained `stanza` model:\n\n- Statistical tokenization (reflected in the `Doc` and its tokens)\n - Lemmatization (`token.lemma` and `token.lemma_`)\n - Part-of-speech tagging (`token.tag`, `token.tag_`, `token.pos`, `token.pos_`)\n - Dependency parsing (`token.dep`, `token.dep_`, `token.head`)\n - Named entity recognition (`doc.ents`, `token.ent_type`, `token.ent_type_`, `token.ent_iob`, `token.ent_iob_`)\n - Sentence segmentation (`doc.sents`)", "github": "explosion/spacy-stanza", + "pip": "spacy-stanza", "thumb": "https://i.imgur.com/myhLjMJ.png", "code_example": [ "import stanza", From 3b53617a69287c45284d0aedc4c7fefcaa631662 Mon Sep 17 00:00:00 2001 From: Baciccin Date: Thu, 19 Mar 2020 21:20:17 -0700 Subject: [PATCH 41/46] Add Ligurian language --- .github/contributors/Baciccin.md | 106 +++++++++++++++++++++++++ spacy/lang/lij/__init__.py | 31 ++++++++ spacy/lang/lij/examples.py | 18 +++++ spacy/lang/lij/punctuation.py | 15 ++++ spacy/lang/lij/stop_words.py | 43 ++++++++++ spacy/lang/lij/tokenizer_exceptions.py | 52 ++++++++++++ website/meta/languages.json | 6 ++ 7 files changed, 271 insertions(+) create mode 100644 .github/contributors/Baciccin.md create mode 100644 spacy/lang/lij/__init__.py create mode 100644 spacy/lang/lij/examples.py create mode 100644 spacy/lang/lij/punctuation.py create mode 100644 spacy/lang/lij/stop_words.py create mode 100644 spacy/lang/lij/tokenizer_exceptions.py diff --git a/.github/contributors/Baciccin.md b/.github/contributors/Baciccin.md new file mode 100644 index 000000000..c7a940cb5 --- /dev/null +++ b/.github/contributors/Baciccin.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------ | +| Name | Giovanni Battista Parodi | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-03-19 | +| GitHub username | Baciccin | +| Website (optional) | | diff --git a/spacy/lang/lij/__init__.py b/spacy/lang/lij/__init__.py new file mode 100644 index 000000000..9b4b29798 --- /dev/null +++ b/spacy/lang/lij/__init__.py @@ -0,0 +1,31 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .punctuation import TOKENIZER_INFIXES + +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS +from ...language import Language +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups + + +class LigurianDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "lij" + lex_attr_getters[NORM] = add_lookups( + Language.Defaults.lex_attr_getters[NORM], BASE_NORMS + ) + tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + stop_words = STOP_WORDS + infixes = TOKENIZER_INFIXES + + +class Ligurian(Language): + lang = "lij" + Defaults = LigurianDefaults + + +__all__ = ["Ligurian"] diff --git a/spacy/lang/lij/examples.py b/spacy/lang/lij/examples.py new file mode 100644 index 000000000..c4034ae7e --- /dev/null +++ b/spacy/lang/lij/examples.py @@ -0,0 +1,18 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.lij.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Sciusciâ e sciorbî no se peu.", + "Graçie di çetroin, che me son arrivæ.", + "Vegnime apreuvo, che ve fasso pescâ di òmmi.", + "Bella pe sempre l'ægua inta conchetta quande unn'agoggia d'ægua a se â trapaña.", +] diff --git a/spacy/lang/lij/punctuation.py b/spacy/lang/lij/punctuation.py new file mode 100644 index 000000000..4439376c8 --- /dev/null +++ b/spacy/lang/lij/punctuation.py @@ -0,0 +1,15 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ..punctuation import TOKENIZER_INFIXES +from ..char_classes import ALPHA + + +ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "") + + +_infixes = TOKENIZER_INFIXES + [ + r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION) +] + +TOKENIZER_INFIXES = _infixes diff --git a/spacy/lang/lij/stop_words.py b/spacy/lang/lij/stop_words.py new file mode 100644 index 000000000..7ab34adf1 --- /dev/null +++ b/spacy/lang/lij/stop_words.py @@ -0,0 +1,43 @@ +# coding: utf8 +from __future__ import unicode_literals + + +STOP_WORDS = set( + """ +a à â a-a a-e a-i a-o aiva aloa an ancheu ancon apreuvo ascì atra atre atri atro avanti avei + +bella belle belli bello ben + +ch' che chì chi ciù co-a co-e co-i co-o comm' comme con cösa coscì cöse + +d' da da-a da-e da-i da-o dapeu de delongo derê di do doe doî donde dòppo + +é e ê ea ean emmo en ëse + +fin fiña + +gh' ghe guæei + +i î in insemme int' inta inte inti into + +l' lê lì lô + +m' ma manco me megio meno mezo mi + +na n' ne ni ninte nisciun nisciuña no + +o ò ô oua + +parte pe pe-a pe-i pe-e pe-o perché pittin pö primma pròpio + +quæ quand' quande quarche quella quelle quelli quello + +s' sce scê sci sciâ sciô sciù se segge seu sò solo son sott' sta stæta stæte stæti stæto ste sti sto + +tanta tante tanti tanto te ti torna tra 
tròppo tutta tutte tutti tutto + +un uña unn' unna + +za zu +""".split() +) diff --git a/spacy/lang/lij/tokenizer_exceptions.py b/spacy/lang/lij/tokenizer_exceptions.py new file mode 100644 index 000000000..2aa6f8304 --- /dev/null +++ b/spacy/lang/lij/tokenizer_exceptions.py @@ -0,0 +1,52 @@ +# coding: utf8 +from __future__ import unicode_literals +from ...symbols import ORTH, LEMMA + +_exc = {} + +for raw, lemma in [ + ("a-a", "a-o"), + ("a-e", "a-o"), + ("a-o", "a-o"), + ("a-i", "a-o"), + ("co-a", "co-o"), + ("co-e", "co-o"), + ("co-i", "co-o"), + ("co-o", "co-o"), + ("da-a", "da-o"), + ("da-e", "da-o"), + ("da-i", "da-o"), + ("da-o", "da-o"), + ("pe-a", "pe-o"), + ("pe-e", "pe-o"), + ("pe-i", "pe-o"), + ("pe-o", "pe-o"), +]: + for orth in [raw, raw.capitalize()]: + _exc[orth] = [{ORTH: orth, LEMMA: lemma}] + +# Prefix + prepositions with à (e.g. "sott'a-o") + +for prep, prep_lemma in [ + ("a-a", "a-o"), + ("a-e", "a-o"), + ("a-o", "a-o"), + ("a-i", "a-o"), +]: + for prefix, prefix_lemma in [ + ("sott'", "sotta"), + ("sott’", "sotta"), + ("contr'", "contra"), + ("contr’", "contra"), + ("ch'", "che"), + ("ch’", "che"), + ("s'", "se"), + ("s’", "se"), + ]: + for prefix_orth in [prefix, prefix.capitalize()]: + _exc[prefix_orth+prep] = [ + {ORTH: prefix_orth, LEMMA: prefix_lemma}, + {ORTH: prep, LEMMA: prep_lemma}, + ] + +TOKENIZER_EXCEPTIONS = _exc diff --git a/website/meta/languages.json b/website/meta/languages.json index 8834aaddc..41c1bce7f 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -181,6 +181,12 @@ "name": "Vietnamese", "dependencies": [{ "name": "Pyvi", "url": "https://github.com/trungtv/pyvi" }] }, + { + "code": "lij", + "name": "Ligurian", + "example": "Sta chì a l'é unna fraxe.", + "has_examples": true + }, { "code": "xx", "name": "Multi-language", From 2897a73559ca1663d0e258604686e0134b9095d0 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 23 Mar 2020 19:23:47 +0100 Subject: [PATCH 42/46] Improve German tokenizer settings style --- spacy/lang/de/punctuation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/lang/de/punctuation.py b/spacy/lang/de/punctuation.py index c376ce597..da6ab1d40 100644 --- a/spacy/lang/de/punctuation.py +++ b/spacy/lang/de/punctuation.py @@ -4,10 +4,10 @@ from __future__ import unicode_literals from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES from ..char_classes import LIST_CURRENCY, CURRENCY, UNITS, PUNCT from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER -from ..punctuation import _prefixes, _suffixes +from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES -_prefixes = ["``",] + list(_prefixes) +_prefixes = ["``"] + BASE_TOKENIZER_PREFIXES _suffixes = ( ["''", "/"] From 30d862d4d891f0314cf5732aa798019f4b112369 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 23 Mar 2020 19:52:57 +0100 Subject: [PATCH 43/46] Update from macOS-10.13 to macOS-10.14 --- azure-pipelines.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 054365336..147d2e903 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -48,7 +48,7 @@ jobs: imageName: 'vs2017-win2016' python.version: '3.6' Python36Mac: - imageName: 'macos-10.13' + imageName: 'macos-10.14' python.version: '3.6' # Don't test on 3.7 for now to speed up builds # Python37Linux: @@ -67,7 +67,7 @@ jobs: imageName: 'vs2017-win2016' python.version: '3.8' Python38Mac: - imageName: 'macos-10.13' + imageName: 
'macos-10.14' python.version: '3.8' maxParallel: 4 pool: From f8b4407a29df5cbf85f5b4179c8b4c1cdd847ea9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Gilli=C3=9Fen?= Date: Tue, 24 Mar 2020 10:22:12 +0100 Subject: [PATCH 44/46] Remove max_length parameter The parameter max_length is deprecated in PhraseMatcher, as stated here: https://spacy.io/api/phrasematcher#init --- examples/information_extraction/phrase_matcher.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/information_extraction/phrase_matcher.py b/examples/information_extraction/phrase_matcher.py index cc6f46055..f3622bfdd 100644 --- a/examples/information_extraction/phrase_matcher.py +++ b/examples/information_extraction/phrase_matcher.py @@ -88,8 +88,8 @@ def read_text(bz2_loc, n=10000): break -def get_matches(tokenizer, phrases, texts, max_length=6): - matcher = PhraseMatcher(tokenizer.vocab, max_length=max_length) +def get_matches(tokenizer, phrases, texts): + matcher = PhraseMatcher(tokenizer.vocab) matcher.add("Phrase", None, *phrases) for text in texts: doc = tokenizer(text) From 5d067bcc5e480ed6b446e0e80ddf97b6a42cc80e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Gilli=C3=9Fen?= Date: Tue, 24 Mar 2020 10:42:10 +0100 Subject: [PATCH 45/46] Add SCA for guerda --- .github/contributors/guerda.md | 106 +++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/guerda.md diff --git a/.github/contributors/guerda.md b/.github/contributors/guerda.md new file mode 100644 index 000000000..86eedd528 --- /dev/null +++ b/.github/contributors/guerda.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Philip Gillißen | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-03-24 | +| GitHub username | guerda | +| Website (optional) | | From 128acb9ee143ee6888e05ec00aa78e7e44f97f09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20Gilli=C3=9Fen?= Date: Tue, 24 Mar 2020 10:42:30 +0100 Subject: [PATCH 46/46] Update guerda.md --- .github/contributors/guerda.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/contributors/guerda.md b/.github/contributors/guerda.md index 86eedd528..6ac418e2e 100644 --- a/.github/contributors/guerda.md +++ b/.github/contributors/guerda.md @@ -87,7 +87,7 @@ U.S. Federal law. Any choice of law rules will not apply. 7. Please place an “x” on one of the applicable statement below. Please do NOT mark both statements: - * [ ] I am signing on behalf of myself as an individual and no other person + * [x] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.