From 1c0184258878d78b6058f6f6e3638da214f207b6 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 11 Feb 2020 23:42:17 +0100 Subject: [PATCH 01/17] add pyx and pxd files to the distribution (#5000) --- MANIFEST.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MANIFEST.in b/MANIFEST.in index 78655a5f4..1947b9140 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,5 @@ recursive-include include *.h -recursive-include spacy *.txt +recursive-include spacy *.txt *.pyx *.pxd include LICENSE include README.md include bin/spacy From 842dfddbb96e598d8e2b27b305d4d3dfa4d69d83 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 11 Feb 2020 23:44:56 +0100 Subject: [PATCH 02/17] Standardize Greek tag map setup (#4997) * Rename `tag_map.py` to `tag_map_fine.py` to indicate that it's not the default tag map * Remove duplicate generic UD tag map and load `../tag_map.py` instead --- spacy/lang/el/__init__.py | 2 +- spacy/lang/el/{tag_map.py => tag_map_fine.py} | 0 spacy/lang/el/tag_map_general.py | 27 ------------------- 3 files changed, 1 insertion(+), 28 deletions(-) rename spacy/lang/el/{tag_map.py => tag_map_fine.py} (100%) delete mode 100644 spacy/lang/el/tag_map_general.py diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py index 16863e6d7..6d551cc4e 100644 --- a/spacy/lang/el/__init__.py +++ b/spacy/lang/el/__init__.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from .tag_map_general import TAG_MAP +from ..tag_map import TAG_MAP from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .lemmatizer import GreekLemmatizer diff --git a/spacy/lang/el/tag_map.py b/spacy/lang/el/tag_map_fine.py similarity index 100% rename from spacy/lang/el/tag_map.py rename to spacy/lang/el/tag_map_fine.py diff --git a/spacy/lang/el/tag_map_general.py b/spacy/lang/el/tag_map_general.py deleted file mode 100644 index 42e64a013..000000000 --- a/spacy/lang/el/tag_map_general.py +++ /dev/null @@ -1,27 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from ...symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ -from ...symbols import PUNCT, NUM, AUX, X, ADJ, VERB, PART, SPACE, CCONJ - - -TAG_MAP = { - "ADJ": {POS: ADJ}, - "ADV": {POS: ADV}, - "INTJ": {POS: INTJ}, - "NOUN": {POS: NOUN}, - "PROPN": {POS: PROPN}, - "VERB": {POS: VERB}, - "ADP": {POS: ADP}, - "CCONJ": {POS: CCONJ}, - "SCONJ": {POS: SCONJ}, - "PART": {POS: PART}, - "PUNCT": {POS: PUNCT}, - "SYM": {POS: SYM}, - "NUM": {POS: NUM}, - "PRON": {POS: PRON}, - "AUX": {POS: AUX}, - "SPACE": {POS: SPACE}, - "DET": {POS: DET}, - "X": {POS: X}, -} From 99a543367dc35b12aad00c4cd845ddd1f4870056 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 11 Feb 2020 23:45:41 +0100 Subject: [PATCH 03/17] Set GPU before loading any models in train CLI (#4989) Set the GPU before loading any existing models in the train CLI so that you can start with a base model and train on GPU. 
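For reference, a minimal sketch of the ordering this change enforces, using the same `spacy.util` helpers the patch imports; the model name and GPU id below are placeholders, not part of the patch:

```python
# Activate the GPU first, then load the base model, mirroring the new CLI logic.
from spacy import util

activated_gpu = None
try:
    activated_gpu = util.use_gpu(0)  # imported as set_gpu in the CLI
except Exception as e:
    print("Unable to activate GPU: {}".format(e))
if activated_gpu is None:
    print("Falling back to CPU")
nlp = util.load_model("en_core_web_sm")  # loaded only after the device is set
```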
--- spacy/cli/train.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 7de1d445d..0a9285863 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -14,6 +14,7 @@ import contextlib import random from .._ml import create_default_optimizer +from ..util import use_gpu as set_gpu from ..attrs import PROB, IS_OOV, CLUSTER, LANG from ..gold import GoldCorpus from ..compat import path2str @@ -147,6 +148,18 @@ def train( disabled_pipes = None pipes_added = False msg.text("Training pipeline: {}".format(pipeline)) + if use_gpu >= 0: + activated_gpu = None + try: + activated_gpu = set_gpu(use_gpu) + except Exception as e: + msg.warn("Exception: {}".format(e)) + if activated_gpu is not None: + msg.text("Using GPU: {}".format(use_gpu)) + else: + msg.warn("Unable to activate GPU: {}".format(use_gpu)) + msg.text("Using CPU only") + use_gpu = -1 if base_model: msg.text("Starting with base model '{}'".format(base_model)) nlp = util.load_model(base_model) From 979a3fd1f51b7f3982f6d8c58ff327b122893913 Mon Sep 17 00:00:00 2001 From: nlptechbook <60931109+nlptechbook@users.noreply.github.com> Date: Sat, 15 Feb 2020 09:44:55 -0500 Subject: [PATCH 04/17] Update universe.json (#5022) e-book is available from https://nostarch.com/NLPPython --- website/meta/universe.json | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index cf5978edc..e0e48a916 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -999,6 +999,17 @@ "author": "Graphbrain", "category": ["standalone"] }, + { + "type": "education", + "id": "nostarch-nlp-python", + "title": "Natural Language Processing Using Python", + "slogan": "No Starch Press, 2020", + "description": "Natural Language Processing Using Python is an introduction to natural language processing (NLP), the task of converting human language into data that a computer can process. The book uses spaCy, a leading Python library for NLP, to guide readers through common NLP tasks related to generating and understanding human language with code. It addresses problems like understanding a user's intent, continuing a conversation with a human, and maintaining the state of a conversation.", + "cover": "https://nostarch.com/sites/default/files/styles/uc_product_full/public/NaturalLanguageProcessing_final_v01.jpg", + "url": "https://nostarch.com/NLPPython", + "author": "Yuli Vasiliev", + "category": ["books"] + }, { "type": "education", "id": "oreilly-python-ds", From ff8e71f46d8ee52a0ec94d973c2bcb87f57c563d Mon Sep 17 00:00:00 2001 From: Christos Aridas Date: Sat, 15 Feb 2020 16:49:09 +0200 Subject: [PATCH 05/17] Update streamlit app (#5017) * Update streamlit app [ci skip] * Add all labels by default * Tidy up and auto-format Co-authored-by: Ines Montani --- examples/streamlit_spacy.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/streamlit_spacy.py b/examples/streamlit_spacy.py index 1afa1cd32..a2da123c2 100644 --- a/examples/streamlit_spacy.py +++ b/examples/streamlit_spacy.py @@ -26,12 +26,12 @@ DEFAULT_TEXT = "Mark Zuckerberg is the CEO of Facebook." HTML_WRAPPER = """
{}
""" -@st.cache(ignore_hash=True) +@st.cache(allow_output_mutation=True) def load_model(name): return spacy.load(name) -@st.cache(ignore_hash=True) +@st.cache(allow_output_mutation=True) def process_text(model_name, text): nlp = load_model(model_name) return nlp(text) @@ -79,7 +79,9 @@ if "ner" in nlp.pipe_names: st.header("Named Entities") st.sidebar.header("Named Entities") label_set = nlp.get_pipe("ner").labels - labels = st.sidebar.multiselect("Entity labels", label_set, label_set) + labels = st.sidebar.multiselect( + "Entity labels", options=label_set, default=list(label_set) + ) html = displacy.render(doc, style="ent", options={"ents": labels}) # Newlines seem to mess with the rendering html = html.replace("\n", " ") From a27c77ce62193fdd777353bbf93b20dc9eda142e Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Sat, 15 Feb 2020 15:50:17 +0100 Subject: [PATCH 06/17] add message when cli train script throws exception (#5009) * add message when cli train script throws exception * fix formatting --- spacy/cli/train.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 0a9285863..82d4da38e 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -508,6 +508,8 @@ def train( "score = {}".format(best_score, current_score) ) break + except Exception as e: + msg.warn("Aborting and saving the final best model. Encountered exception: {}".format(e)) finally: best_pipes = nlp.pipe_names if disabled_pipes: From 257246017572433af7825d561de573dae73828f0 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Sun, 16 Feb 2020 17:16:41 +0100 Subject: [PATCH 07/17] add tok2vec parameters to train script to facilitate init_tok2vec (#5021) --- spacy/cli/pretrain.py | 14 ++++++++++---- spacy/cli/train.py | 45 +++++++++++++++++++++++++++++++++++-------- 2 files changed, 47 insertions(+), 12 deletions(-) diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index c1aade2b2..aaec1ea75 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -34,7 +34,7 @@ from .train import _load_pretrained_tok2vec vectors_model=("Name or path to spaCy model with vectors to learn from"), output_dir=("Directory to write models to on each epoch", "positional", None, str), width=("Width of CNN layers", "option", "cw", int), - depth=("Depth of CNN layers", "option", "cd", int), + conv_depth=("Depth of CNN layers", "option", "cd", int), cnn_window=("Window size for CNN layers", "option", "cW", int), cnn_pieces=("Maxout size for CNN layers. 
1 for Mish", "option", "cP", int), use_chars=("Whether to use character-based embedding", "flag", "chr", bool), @@ -84,7 +84,7 @@ def pretrain( vectors_model, output_dir, width=96, - depth=4, + conv_depth=4, bilstm_depth=0, cnn_pieces=3, sa_depth=0, @@ -132,9 +132,15 @@ def pretrain( msg.info("Using GPU" if has_gpu else "Not using GPU") output_dir = Path(output_dir) + if output_dir.exists() and [p for p in output_dir.iterdir()]: + msg.warn( + "Output directory is not empty", + "It is better to use an empty directory or refer to a new output path, " + "then the new directory will be created for you.", + ) if not output_dir.exists(): output_dir.mkdir() - msg.good("Created output directory") + msg.good("Created output directory: {}".format(output_dir)) srsly.write_json(output_dir / "config.json", config) msg.good("Saved settings to config.json") @@ -162,7 +168,7 @@ def pretrain( Tok2Vec( width, embed_rows, - conv_depth=depth, + conv_depth=conv_depth, pretrained_vectors=pretrained_vectors, bilstm_depth=bilstm_depth, # Requires PyTorch. Experimental. subword_features=not use_chars, # Set to False for Chinese etc diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 82d4da38e..5af93a8f3 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -33,6 +33,13 @@ from .. import about pipeline=("Comma-separated names of pipeline components", "option", "p", str), replace_components=("Replace components from base model", "flag", "R", bool), vectors=("Model to load vectors from", "option", "v", str), + width=("Width of CNN layers of Tok2Vec component", "option", "cw", int), + conv_depth=("Depth of CNN layers of Tok2Vec component", "option", "cd", int), + cnn_window=("Window size for CNN layers of Tok2Vec component", "option", "cW", int), + cnn_pieces=("Maxout size for CNN layers of Tok2Vec component. 1 for Mish", "option", "cP", int), + use_chars=("Whether to use character-based embedding of Tok2Vec component", "flag", "chr", bool), + bilstm_depth=("Depth of BiLSTM layers of Tok2Vec component (requires PyTorch)", "option", "lstm", int), + embed_rows=("Number of embedding rows of Tok2Vec component", "option", "er", int), n_iter=("Number of iterations", "option", "n", int), n_early_stopping=("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int), n_examples=("Number of examples", "option", "ns", int), @@ -64,6 +71,13 @@ def train( pipeline="tagger,parser,ner", replace_components=False, vectors=None, + width=96, + conv_depth=4, + cnn_window=1, + cnn_pieces=3, + use_chars=False, + bilstm_depth=0, + embed_rows=2000, n_iter=30, n_early_stopping=None, n_examples=0, @@ -116,6 +130,7 @@ def train( ) if not output_path.exists(): output_path.mkdir() + msg.good("Created output directory: {}".format(output_path)) # Take dropout and batch size as generators of values -- dropout # starts high and decays sharply, to force the optimizer to explore. 
@@ -250,7 +265,15 @@ def train( optimizer = create_default_optimizer(Model.ops) else: # Start with a blank model, call begin_training - optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu) + cfg = {"device": use_gpu} + cfg["conv_depth"] = conv_depth + cfg["token_vector_width"] = width + cfg["bilstm_depth"] = bilstm_depth + cfg["cnn_maxout_pieces"] = cnn_pieces + cfg["embed_size"] = embed_rows + cfg["conv_window"] = cnn_window + cfg["subword_features"] = not use_chars + optimizer = nlp.begin_training(lambda: corpus.train_tuples, **cfg) nlp._optimizer = None @@ -375,13 +398,19 @@ def train( if not batch: continue docs, golds = zip(*batch) - nlp.update( - docs, - golds, - sgd=optimizer, - drop=next(dropout_rates), - losses=losses, - ) + try: + nlp.update( + docs, + golds, + sgd=optimizer, + drop=next(dropout_rates), + losses=losses, + ) + except ValueError as e: + msg.warn("Error during training") + if init_tok2vec: + msg.warn("Did you provide the same parameters during 'train' as during 'pretrain'?") + msg.fail("Original error message: {}".format(e), exits=1) if raw_text: # If raw text is available, perform 'rehearsal' updates, # which use unlabelled data to reduce overfitting. From 5b102963bf67b6f49fe1c88d1e6fe9f337e6a621 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Sun, 16 Feb 2020 17:17:09 +0100 Subject: [PATCH 08/17] Require HEAD for is_parsed in Doc.from_array() (#5011) Modify flag settings so that `DEP` is not sufficient to set `is_parsed` and only run `set_children_from_heads()` if `HEAD` is provided. Then the combination `[SENT_START, DEP]` will set deps and not clobber sent starts with a lot of one-word sentences. --- spacy/tests/doc/test_doc_api.py | 35 ++++++++++++++++++++++++++++++++- spacy/tokens/doc.pyx | 2 +- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 86c7fbf72..52f856d3e 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -7,7 +7,7 @@ import numpy from spacy.tokens import Doc, Span from spacy.vocab import Vocab from spacy.errors import ModelsWarning -from spacy.attrs import ENT_TYPE, ENT_IOB +from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP from ..util import get_doc @@ -274,6 +274,39 @@ def test_doc_is_nered(en_vocab): assert new_doc.is_nered +def test_doc_from_array_sent_starts(en_vocab): + words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."] + heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6] + deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep", "dep"] + doc = Doc(en_vocab, words=words) + for i, (dep, head) in enumerate(zip(deps, heads)): + doc[i].dep_ = dep + doc[i].head = doc[head] + if head == i: + doc[i].is_sent_start = True + doc.is_parsed + + attrs = [SENT_START, HEAD] + arr = doc.to_array(attrs) + new_doc = Doc(en_vocab, words=words) + with pytest.raises(ValueError): + new_doc.from_array(attrs, arr) + + attrs = [SENT_START, DEP] + arr = doc.to_array(attrs) + new_doc = Doc(en_vocab, words=words) + new_doc.from_array(attrs, arr) + assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc] + assert not new_doc.is_parsed + + attrs = [HEAD, DEP] + arr = doc.to_array(attrs) + new_doc = Doc(en_vocab, words=words) + new_doc.from_array(attrs, arr) + assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc] + assert new_doc.is_parsed + + def test_doc_lang(en_vocab): doc = Doc(en_vocab, words=["Hello", "world"]) assert doc.lang_ == "en" 
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 4aee21153..04e02fd98 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -813,7 +813,7 @@ cdef class Doc: if attr_ids[j] != TAG: Token.set_struct_attr(token, attr_ids[j], array[i, j]) # Set flags - self.is_parsed = bool(self.is_parsed or HEAD in attrs or DEP in attrs) + self.is_parsed = bool(self.is_parsed or HEAD in attrs) self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs) # If document is parsed, set children if self.is_parsed: From 0c47a53b5ece01d5740eea7203400b0f90ce2f15 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Sun, 16 Feb 2020 17:19:41 +0100 Subject: [PATCH 09/17] Use int only in key2row for better performance (#4990) Cast all keys and rows to `int` in `vectors.key2row` for more efficient access and serialization. --- spacy/vectors.pyx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 6b26bf123..c6526b89d 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -283,7 +283,11 @@ cdef class Vectors: DOCS: https://spacy.io/api/vectors#add """ - key = get_string_id(key) + # use int for all keys and rows in key2row for more efficient access + # and serialization + key = int(get_string_id(key)) + if row is not None: + row = int(row) if row is None and key in self.key2row: row = self.key2row[key] elif row is None: From 3b22eb651be9e80160efa4fcdbd453a71b6de857 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Sun, 16 Feb 2020 17:20:36 +0100 Subject: [PATCH 10/17] Sync Span __eq__ and __hash__ (#5005) * Sync Span __eq__ and __hash__ Use the same tuple for `__eq__` and `__hash__`, including all attributes except `vector` and `vector_norm`. * Update entity comparison in tests Update `assert_docs_equal()` test util to compare `Span` properties for ents rather than `Span` objects. 
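A small sketch of the comparison semantics after this change; the blank English pipeline and example text are assumptions, not part of the patch:

```python
# Spans now compare and hash on (doc, start_char, end_char, label, kb_id).
import spacy

nlp = spacy.blank("en")
doc_a = nlp("New York is big")
doc_b = nlp("New York is big")

assert doc_a[0:2] == doc_a[0:2]    # same doc, offsets, label and kb_id
assert doc_a[0:2] != doc_a[1:3]    # different offsets
assert doc_a[0:2] != doc_b[0:2]    # same offsets, but a different Doc object
assert hash(doc_a[0:2]) == hash(doc_a[0:2])
```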
--- spacy/tests/doc/test_span.py | 9 +++++++++ spacy/tests/util.py | 6 +++++- spacy/tokens/span.pyx | 13 +++++++++---- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 01bb93c50..917f22e9c 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -279,3 +279,12 @@ def test_filter_spans(doc): assert len(filtered[1]) == 5 assert filtered[0].start == 1 and filtered[0].end == 4 assert filtered[1].start == 5 and filtered[1].end == 10 + + +def test_span_eq_hash(doc, doc_not_parsed): + assert doc[0:2] == doc[0:2] + assert doc[0:2] != doc[1:3] + assert doc[0:2] != doc_not_parsed[0:2] + assert hash(doc[0:2]) == hash(doc[0:2]) + assert hash(doc[0:2]) != hash(doc[1:3]) + assert hash(doc[0:2]) != hash(doc_not_parsed[0:2]) diff --git a/spacy/tests/util.py b/spacy/tests/util.py index 175480fe7..9ee5b89f8 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -95,7 +95,11 @@ def assert_docs_equal(doc1, doc2): assert [t.ent_type for t in doc1] == [t.ent_type for t in doc2] assert [t.ent_iob for t in doc1] == [t.ent_iob for t in doc2] - assert [ent for ent in doc1.ents] == [ent for ent in doc2.ents] + for ent1, ent2 in zip(doc1.ents, doc2.ents): + assert ent1.start == ent2.start + assert ent1.end == ent2.end + assert ent1.label == ent2.label + assert ent1.kb_id == ent2.kb_id def assert_packed_msg_equal(b1, b2): diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 24857790b..35c70f236 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -127,22 +127,27 @@ cdef class Span: return False else: return True - # Eq + # < if op == 0: return self.start_char < other.start_char + # <= elif op == 1: return self.start_char <= other.start_char + # == elif op == 2: - return self.start_char == other.start_char and self.end_char == other.end_char + return (self.doc, self.start_char, self.end_char, self.label, self.kb_id) == (other.doc, other.start_char, other.end_char, other.label, other.kb_id) + # != elif op == 3: - return self.start_char != other.start_char or self.end_char != other.end_char + return (self.doc, self.start_char, self.end_char, self.label, self.kb_id) != (other.doc, other.start_char, other.end_char, other.label, other.kb_id) + # > elif op == 4: return self.start_char > other.start_char + # >= elif op == 5: return self.start_char >= other.start_char def __hash__(self): - return hash((self.doc, self.label, self.start_char, self.end_char)) + return hash((self.doc, self.start_char, self.end_char, self.label, self.kb_id)) def __len__(self): """Get the number of tokens in the span. 
From 72c964bcf408f34ecf7e9da94404213edec3e9e6 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Sun, 16 Feb 2020 17:21:18 +0100 Subject: [PATCH 11/17] define pretrained_dims which is used by build_text_classifier (#5004) --- spacy/language.py | 1 + spacy/pipeline/pipes.pyx | 1 + 2 files changed, 2 insertions(+) diff --git a/spacy/language.py b/spacy/language.py index 5544b6341..869fa09a7 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -608,6 +608,7 @@ class Language(object): link_vectors_to_models(self.vocab) if self.vocab.vectors.data.shape[1]: cfg["pretrained_vectors"] = self.vocab.vectors.name + cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1] if sgd is None: sgd = create_default_optimizer(Model.ops) self._optimizer = sgd diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index b4fecf5cb..3b190debe 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1044,6 +1044,7 @@ class TextCategorizer(Pipe): self.add_label(cat) if self.model is True: self.cfg["pretrained_vectors"] = kwargs.get("pretrained_vectors") + self.cfg["pretrained_dims"] = kwargs.get("pretrained_dims") self.require_labels() self.model = self.Model(len(self.labels), **self.cfg) link_vectors_to_models(self.vocab) From f6ed07b85c0b9204b5d388eb91da5cee30d5b842 Mon Sep 17 00:00:00 2001 From: Kabir Khan Date: Sun, 16 Feb 2020 09:17:47 -0800 Subject: [PATCH 12/17] Use nlp.pipe in EntityRuler for phrase patterns in add_patterns (#4931) * Fix ent_ids and labels properties when id attribute used in patterns * use set for labels * sort end_ids for comparison in entity_ruler tests * fixing entity_ruler ent_ids test * add to set * Run make_doc optimistically if using phrase matcher patterns. * remove unused coveragerc I was testing with * format * Refactor EntityRuler.add_patterns to use nlp.pipe for phrase patterns. Improves speed substantially. * Removing old add_patterns function * Fixing spacing * Make sure token_patterns loaded as well, before generator was being emptied in from_disk --- spacy/pipeline/entityruler.py | 41 +++++++++++++++++++++-- website/docs/usage/rule-based-matching.md | 27 +++++++++++++++ 2 files changed, 65 insertions(+), 3 deletions(-) diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 1c8429049..c3ef429e9 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -8,7 +8,7 @@ from ..language import component from ..errors import Errors from ..compat import basestring_ from ..util import ensure_path, to_disk, from_disk -from ..tokens import Span +from ..tokens import Doc, Span from ..matcher import Matcher, PhraseMatcher DEFAULT_ENT_ID_SEP = "||" @@ -162,6 +162,7 @@ class EntityRuler(object): @property def patterns(self): """Get all patterns that were added to the entity ruler. + RETURNS (list): The original patterns, one dictionary per pattern. 
DOCS: https://spacy.io/api/entityruler#patterns @@ -194,6 +195,7 @@ class EntityRuler(object): DOCS: https://spacy.io/api/entityruler#add_patterns """ + # disable the nlp components after this one in case they hadn't been initialized / deserialised yet try: current_index = self.nlp.pipe_names.index(self.name) @@ -203,7 +205,33 @@ class EntityRuler(object): except ValueError: subsequent_pipes = [] with self.nlp.disable_pipes(subsequent_pipes): + token_patterns = [] + phrase_pattern_labels = [] + phrase_pattern_texts = [] + phrase_pattern_ids = [] + for entry in patterns: + if isinstance(entry["pattern"], basestring_): + phrase_pattern_labels.append(entry["label"]) + phrase_pattern_texts.append(entry["pattern"]) + phrase_pattern_ids.append(entry.get("id")) + elif isinstance(entry["pattern"], list): + token_patterns.append(entry) + + phrase_patterns = [] + for label, pattern, ent_id in zip( + phrase_pattern_labels, + self.nlp.pipe(phrase_pattern_texts), + phrase_pattern_ids + ): + phrase_pattern = { + "label": label, "pattern": pattern, "id": ent_id + } + if ent_id: + phrase_pattern["id"] = ent_id + phrase_patterns.append(phrase_pattern) + + for entry in token_patterns + phrase_patterns: label = entry["label"] if "id" in entry: ent_label = label @@ -212,8 +240,8 @@ class EntityRuler(object): self._ent_ids[key] = (ent_label, entry["id"]) pattern = entry["pattern"] - if isinstance(pattern, basestring_): - self.phrase_patterns[label].append(self.nlp(pattern)) + if isinstance(pattern, Doc): + self.phrase_patterns[label].append(pattern) elif isinstance(pattern, list): self.token_patterns[label].append(pattern) else: @@ -226,6 +254,8 @@ class EntityRuler(object): def _split_label(self, label): """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep + label (str): The value of label in a pattern entry + RETURNS (tuple): ent_label, ent_id """ if self.ent_id_sep in label: @@ -239,6 +269,9 @@ class EntityRuler(object): def _create_label(self, label, ent_id): """Join Entity label with ent_id if the pattern has an `id` attribute + label (str): The label to set for ent.label_ + ent_id (str): The label + RETURNS (str): The ent_label joined with configured `ent_id_sep` """ if isinstance(ent_id, basestring_): @@ -250,6 +283,7 @@ class EntityRuler(object): patterns_bytes (bytes): The bytestring to load. **kwargs: Other config paramters, mostly for consistency. + RETURNS (EntityRuler): The loaded entity ruler. DOCS: https://spacy.io/api/entityruler#from_bytes @@ -292,6 +326,7 @@ class EntityRuler(object): path (unicode / Path): The JSONL file to load. **kwargs: Other config paramters, mostly for consistency. + RETURNS (EntityRuler): The loaded entity ruler. DOCS: https://spacy.io/api/entityruler#from_disk diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index cae4f074a..f8866aec1 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -1096,6 +1096,33 @@ with the patterns. When you load the model back in, all pipeline components will be restored and deserialized – including the entity ruler. This lets you ship powerful model packages with binary weights _and_ rules included! +### Using a large number of phrase patterns {#entityruler-large-phrase-patterns new="2.2.4"} + +When using a large amount of **phrase patterns** (roughly > 10000) it's useful to understand how the `add_patterns` function of the EntityRuler works. 
For each **phrase pattern**, +the EntityRuler calls the nlp object to construct a doc object. This happens in case you try +to add the EntityRuler at the end of an existing pipeline with, for example, a POS tagger and want to +extract matches based on the pattern's POS signature. + +In this case you would pass a config value of `phrase_matcher_attr="POS"` for the EntityRuler. + +Running the full language pipeline across every pattern in a large list scales linearly and can therefore take a long time on large amounts of phrase patterns. + +As of spaCy 2.2.4 the `add_patterns` function has been refactored to use nlp.pipe on all phrase patterns resulting in about a 10x-20x speed up with 5,000-100,000 phrase patterns respectively. + +Even with this speedup (but especially if you're using an older version) the `add_patterns` function can still take a long time. + +An easy workaround to make this function run faster is disabling the other language pipes +while adding the phrase patterns. + +```python +entityruler = EntityRuler(nlp) +patterns = [{"label": "TEST", "pattern": str(i)} for i in range(100000)] + +other_pipes = [p for p in nlp.pipe_names if p != "tagger"] +with nlp.disable_pipes(*disable_pipes): + entityruler.add_patterns(patterns) +``` + ## Combining models and rules {#models-rules} You can combine statistical and rule-based components in a variety of ways. From c7e4fe9c5c979aad5327888c84f77db7a3da38cb Mon Sep 17 00:00:00 2001 From: Jan Jessewitsch <61113983+Jan-711@users.noreply.github.com> Date: Mon, 17 Feb 2020 18:59:22 +0100 Subject: [PATCH 13/17] Fix/Improve german stop words (#5024) * Fix german stop words Two stop words ("einige" and "einigen") are sticking together. Remove three nouns that may serve as stop words in a specific context (e.g. religious or news) but are not applicable for general use. * Create Jan-711.md --- .github/contributors/Jan-711.md | 106 ++++++++++++++++++++++++++++++++ spacy/lang/de/stop_words.py | 9 ++- 2 files changed, 110 insertions(+), 5 deletions(-) create mode 100644 .github/contributors/Jan-711.md diff --git a/.github/contributors/Jan-711.md b/.github/contributors/Jan-711.md new file mode 100644 index 000000000..60297640c --- /dev/null +++ b/.github/contributors/Jan-711.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. 
The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Jan Jessewitsch | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 16.02.2020 | +| GitHub username | Jan-711 | +| Website (optional) | | diff --git a/spacy/lang/de/stop_words.py b/spacy/lang/de/stop_words.py index cf3204d5e..69134124f 100644 --- a/spacy/lang/de/stop_words.py +++ b/spacy/lang/de/stop_words.py @@ -22,14 +22,14 @@ dort drei drin dritte dritten dritter drittes du durch durchaus dürfen dürft durfte durften eben ebenso ehrlich eigen eigene eigenen eigener eigenes ein einander eine -einem einen einer eines einigeeinigen einiger einiges einmal einmaleins elf en +einem einen einer eines einige einigen einiger einiges einmal einmaleins elf en ende endlich entweder er erst erste ersten erster erstes es etwa etwas euch früher fünf fünfte fünften fünfter fünftes für gab ganz ganze ganzen ganzer ganzes gar gedurft gegen gegenüber gehabt gehen geht gekannt gekonnt gemacht gemocht gemusst genug gerade gern gesagt geschweige -gewesen gewollt geworden gibt ging gleich gott gross groß grosse große grossen +gewesen gewollt geworden gibt ging gleich gross groß grosse große grossen großen grosser großer grosses großes gut gute guter gutes habe haben habt hast hat hatte hätte hatten hätten heisst heißt her heute hier @@ -47,9 +47,8 @@ kleines kommen kommt können könnt konnte könnte konnten kurz lang lange leicht leider lieber los machen macht machte mag magst man manche manchem manchen mancher manches mehr -mein meine meinem meinen meiner meines mensch menschen mich mir mit mittel -mochte möchte mochten mögen möglich mögt morgen muss muß müssen musst müsst -musste mussten +mein meine meinem meinen meiner meines mich mir mit mittel mochte möchte mochten +mögen möglich mögt morgen muss muß müssen musst müsst musste mussten na nach nachdem nahm natürlich neben nein neue neuen neun neunte neunten neunter neuntes nicht nichts nie niemand niemandem niemanden noch nun nur From 2164e71ea87941aade217b621a4fbc002e758742 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 19 Feb 2020 16:16:00 +0100 Subject: [PATCH 14/17] Improved Romanian tokenization for UD RRT (#5036) Modifications to Romanian tokenization to improve tokenization for UD_Romanian-RRT. 
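A quick way to inspect the new behaviour from Python; the example sentence is an assumption, chosen only to exercise a few of the added prefixes and abbreviations:

```python
# Tokenize Romanian text with the updated punctuation rules and exceptions.
import spacy

nlp = spacy.blank("ro")
doc = nlp("Într-adevăr, s-a dus la Univ. din Cluj.")
print([t.text for t in doc])
```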
--- spacy/lang/ro/__init__.py | 5 + spacy/lang/ro/punctuation.py | 164 ++++++++++++++++++++++++++ spacy/lang/ro/tokenizer_exceptions.py | 47 +++++++- 3 files changed, 215 insertions(+), 1 deletion(-) create mode 100644 spacy/lang/ro/punctuation.py diff --git a/spacy/lang/ro/__init__.py b/spacy/lang/ro/__init__.py index 6c325b74d..c7b744ca5 100644 --- a/spacy/lang/ro/__init__.py +++ b/spacy/lang/ro/__init__.py @@ -3,6 +3,8 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS +from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES +from .punctuation import TOKENIZER_SUFFIXES from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS @@ -24,6 +26,9 @@ class RomanianDefaults(Language.Defaults): ) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = STOP_WORDS + prefixes = TOKENIZER_PREFIXES + suffixes = TOKENIZER_SUFFIXES + infixes = TOKENIZER_INFIXES tag_map = TAG_MAP diff --git a/spacy/lang/ro/punctuation.py b/spacy/lang/ro/punctuation.py new file mode 100644 index 000000000..87f9a1248 --- /dev/null +++ b/spacy/lang/ro/punctuation.py @@ -0,0 +1,164 @@ +# coding: utf8 +from __future__ import unicode_literals + +import itertools + +from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY +from ..char_classes import LIST_ICONS, CURRENCY +from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT + + +_list_icons = [x for x in LIST_ICONS if x != "°"] +_list_icons = [x.replace("\\u00B0", "") for x in _list_icons] + + +_ro_variants = { + "Ă": ["Ă", "A"], + "Â": ["Â", "A"], + "Î": ["Î", "I"], + "Ș": ["Ș", "Ş", "S"], + "Ț": ["Ț", "Ţ", "T"], +} + + +def _make_ro_variants(tokens): + variants = [] + for token in tokens: + upper_token = token.upper() + upper_char_variants = [_ro_variants.get(c, [c]) for c in upper_token] + upper_variants = ["".join(x) for x in itertools.product(*upper_char_variants)] + for variant in upper_variants: + variants.extend([variant, variant.lower(), variant.title()]) + return sorted(list(set(variants))) + + +# UD_Romanian-RRT closed class prefixes +# POS: ADP|AUX|CCONJ|DET|NUM|PART|PRON|SCONJ +_ud_rrt_prefixes = [ + "a-", + "c-", + "ce-", + "cu-", + "d-", + "de-", + "dintr-", + "e-", + "făr-", + "i-", + "l-", + "le-", + "m-", + "mi-", + "n-", + "ne-", + "p-", + "pe-", + "prim-", + "printr-", + "s-", + "se-", + "te-", + "v-", + "într-", + "ș-", + "și-", + "ți-", +] +_ud_rrt_prefix_variants = _make_ro_variants(_ud_rrt_prefixes) + + +# UD_Romanian-RRT closed class suffixes without NUM +# POS: ADP|AUX|CCONJ|DET|PART|PRON|SCONJ +_ud_rrt_suffixes = [ + "-a", + "-aceasta", + "-ai", + "-al", + "-ale", + "-alta", + "-am", + "-ar", + "-astea", + "-atâta", + "-au", + "-aș", + "-ați", + "-i", + "-ilor", + "-l", + "-le", + "-lea", + "-mea", + "-meu", + "-mi", + "-mă", + "-n", + "-ndărătul", + "-ne", + "-o", + "-oi", + "-or", + "-s", + "-se", + "-si", + "-te", + "-ul", + "-ului", + "-un", + "-uri", + "-urile", + "-urilor", + "-veți", + "-vă", + "-ăștia", + "-și", + "-ți", +] +_ud_rrt_suffix_variants = _make_ro_variants(_ud_rrt_suffixes) + + +_prefixes = ( + ["§", "%", "=", "—", "–", r"\+(?![0-9])"] + + _ud_rrt_prefix_variants + + LIST_PUNCT + + LIST_ELLIPSES + + LIST_QUOTES + + LIST_CURRENCY + + LIST_ICONS +) + + +_suffixes = ( + _ud_rrt_suffix_variants + + LIST_PUNCT + + LIST_ELLIPSES + + LIST_QUOTES + + _list_icons + + ["—", "–"] + + [ + r"(?<=[0-9])\+", + r"(?<=°[FfCcKk])\.", + 
r"(?<=[0-9])(?:{c})".format(c=CURRENCY), + r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format( + al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT + ), + r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), + ] +) + +_infixes = ( + LIST_ELLIPSES + + _list_icons + + [ + r"(?<=[0-9])[+\*^](?=[0-9-])", + r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( + al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES + ), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9])[:<>=](?=[{a}])".format(a=ALPHA), + ] +) + +TOKENIZER_PREFIXES = _prefixes +TOKENIZER_SUFFIXES = _suffixes +TOKENIZER_INFIXES = _infixes diff --git a/spacy/lang/ro/tokenizer_exceptions.py b/spacy/lang/ro/tokenizer_exceptions.py index a7fb38453..b27344d2a 100644 --- a/spacy/lang/ro/tokenizer_exceptions.py +++ b/spacy/lang/ro/tokenizer_exceptions.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from ...symbols import ORTH +from .punctuation import _make_ro_variants _exc = {} @@ -45,8 +46,52 @@ for orth in [ "dpdv", "șamd.", "ș.a.m.d.", + # below: from UD_Romanian-RRT: + "A.c.", + "A.f.", + "A.r.", + "Al.", + "Art.", + "Aug.", + "Bd.", + "Dem.", + "Dr.", + "Fig.", + "Fr.", + "Gh.", + "Gr.", + "Lt.", + "Nr.", + "Obs.", + "Prof.", + "Sf.", + "a.m.", + "a.r.", + "alin.", + "art.", + "d-l", + "d-lui", + "d-nei", + "ex.", + "fig.", + "ian.", + "lit.", + "lt.", + "p.a.", + "p.m.", + "pct.", + "prep.", + "sf.", + "tel.", + "univ.", + "îngr.", + "într-adevăr", + "Șt.", + "ș.a.", ]: - _exc[orth] = [{ORTH: orth}] + # note: does not distinguish capitalized-only exceptions from others + for variant in _make_ro_variants([orth]): + _exc[variant] = [{ORTH: variant}] TOKENIZER_EXCEPTIONS = _exc From 479bd8d09fd22118463706363e7d23b0578ceea9 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Sat, 22 Feb 2020 14:11:51 +0100 Subject: [PATCH 15/17] add lemma option to displacy 'dep' visualiser (#5041) * add lemma option to displacy 'dep' visualiser * more compact list comprehension * add option to doc * fix test and add lemmas to util.get_doc * fix capital * remove lemma from get_doc * cleanup --- spacy/displacy/__init__.py | 10 ++++++---- spacy/displacy/render.py | 8 +++++--- spacy/displacy/templates.py | 9 +++++++++ spacy/tests/test_displacy.py | 8 ++++---- website/docs/api/top-level.md | 1 + 5 files changed, 25 insertions(+), 11 deletions(-) diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index c17b80aef..e13b0403b 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -144,10 +144,12 @@ def parse_deps(orig_doc, options={}): for span, tag, lemma, ent_type in spans: attrs = {"tag": tag, "lemma": lemma, "ent_type": ent_type} retokenizer.merge(span, attrs=attrs) - if options.get("fine_grained"): - words = [{"text": w.text, "tag": w.tag_} for w in doc] - else: - words = [{"text": w.text, "tag": w.pos_} for w in doc] + fine_grained = options.get("fine_grained") + add_lemma = options.get("add_lemma") + words = [{"text": w.text, + "tag": w.tag_ if fine_grained else w.pos_, + "lemma": w.lemma_ if add_lemma else None} for w in doc] + arcs = [] for word in doc: if word.i < word.head.i: diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index d6e33437b..68df324d6 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import uuid -from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS, TPL_ENTS +from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_WORDS_LEMMA, TPL_DEP_ARCS, TPL_ENTS from .templates import TPL_ENT, TPL_ENT_RTL, 
TPL_FIGURE, TPL_TITLE, TPL_PAGE from ..util import minify_html, escape_html, registry from ..errors import Errors @@ -83,7 +83,7 @@ class DependencyRenderer(object): self.width = self.offset_x + len(words) * self.distance self.height = self.offset_y + 3 * self.word_spacing self.id = render_id - words = [self.render_word(w["text"], w["tag"], i) for i, w in enumerate(words)] + words = [self.render_word(w["text"], w["tag"], w.get("lemma", None), i) for i, w in enumerate(words)] arcs = [ self.render_arrow(a["label"], a["start"], a["end"], a["dir"], i) for i, a in enumerate(arcs) @@ -101,7 +101,7 @@ class DependencyRenderer(object): lang=self.lang, ) - def render_word(self, text, tag, i): + def render_word(self, text, tag, lemma, i,): """Render individual word. text (unicode): Word text. @@ -114,6 +114,8 @@ class DependencyRenderer(object): if self.direction == "rtl": x = self.width - x html_text = escape_html(text) + if lemma is not None: + return TPL_DEP_WORDS_LEMMA.format(text=html_text, tag=tag, lemma=lemma, x=x, y=y) return TPL_DEP_WORDS.format(text=html_text, tag=tag, x=x, y=y) def render_arrow(self, label, start, end, direction, i): diff --git a/spacy/displacy/templates.py b/spacy/displacy/templates.py index ade75d1d6..f29eab86f 100644 --- a/spacy/displacy/templates.py +++ b/spacy/displacy/templates.py @@ -18,6 +18,15 @@ TPL_DEP_WORDS = """ """ +TPL_DEP_WORDS_LEMMA = """ + + {text} + {lemma} + {tag} + +""" + + TPL_DEP_ARCS = """ diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index 2d1f1bd8f..d04c0506f 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -31,10 +31,10 @@ def test_displacy_parse_deps(en_vocab): deps = displacy.parse_deps(doc) assert isinstance(deps, dict) assert deps["words"] == [ - {"text": "This", "tag": "DET"}, - {"text": "is", "tag": "AUX"}, - {"text": "a", "tag": "DET"}, - {"text": "sentence", "tag": "NOUN"}, + {"lemma": None, "text": "This", "tag": "DET"}, + {"lemma": None, "text": "is", "tag": "AUX"}, + {"lemma": None, "text": "a", "tag": "DET"}, + {"lemma": None, "text": "sentence", "tag": "NOUN"}, ] assert deps["arcs"] == [ {"start": 0, "end": 1, "label": "nsubj", "dir": "left"}, diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 50ba0e3d9..266df87f0 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -239,6 +239,7 @@ If a setting is not present in the options, the default value will be used. | Name | Type | Description | Default | | ------------------ | ------- | --------------------------------------------------------------------------------------------------------------- | ----------------------- | | `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` | +| `add_lemma` | bool | Print the lemma's in a separate row below the token texts in the `dep` visualisation. | `False` | | `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` | | `collapse_phrases` | bool | Merge noun phrases into one token. | `False` | | `compact` | bool | "Compact mode" with square arrows that takes up less space. 
| `False` | From 44f4142ce4b402aac4af650a6b819bfe5b5edbde Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Sat, 22 Feb 2020 14:12:32 +0100 Subject: [PATCH 16/17] add two abbreviations and some additional unit tests (#5040) --- spacy/lang/fi/tokenizer_exceptions.py | 2 ++ spacy/tests/lang/fi/test_tokenizer.py | 27 ++++++++++++++++----------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/spacy/lang/fi/tokenizer_exceptions.py b/spacy/lang/fi/tokenizer_exceptions.py index 5469e345e..7cdc7cf11 100644 --- a/spacy/lang/fi/tokenizer_exceptions.py +++ b/spacy/lang/fi/tokenizer_exceptions.py @@ -14,6 +14,7 @@ for exc_data in [ {ORTH: "alv.", LEMMA: "arvonlisävero"}, {ORTH: "ark.", LEMMA: "arkisin"}, {ORTH: "as.", LEMMA: "asunto"}, + {ORTH: "eaa.", LEMMA: "ennen ajanlaskun alkua"}, {ORTH: "ed.", LEMMA: "edellinen"}, {ORTH: "esim.", LEMMA: "esimerkki"}, {ORTH: "huom.", LEMMA: "huomautus"}, @@ -27,6 +28,7 @@ for exc_data in [ {ORTH: "läh.", LEMMA: "lähettäjä"}, {ORTH: "miel.", LEMMA: "mieluummin"}, {ORTH: "milj.", LEMMA: "miljoona"}, + {ORTH: "Mm.", LEMMA: "muun muassa"}, {ORTH: "mm.", LEMMA: "muun muassa"}, {ORTH: "myöh.", LEMMA: "myöhempi"}, {ORTH: "n.", LEMMA: "noin"}, diff --git a/spacy/tests/lang/fi/test_tokenizer.py b/spacy/tests/lang/fi/test_tokenizer.py index aab063982..301b85d74 100644 --- a/spacy/tests/lang/fi/test_tokenizer.py +++ b/spacy/tests/lang/fi/test_tokenizer.py @@ -10,28 +10,33 @@ ABBREVIATION_TESTS = [ ["Hyvää", "uutta", "vuotta", "t.", "siht.", "Niemelä", "!"], ), ("Paino on n. 2.2 kg", ["Paino", "on", "n.", "2.2", "kg"]), + ( + "Vuonna 1 eaa. tapahtui kauheita.", + ["Vuonna", "1", "eaa.", "tapahtui", "kauheita", "."], + ), ] HYPHENATED_TESTS = [ ( - "1700-luvulle sijoittuva taide-elokuva", - ["1700-luvulle", "sijoittuva", "taide-elokuva"], + "1700-luvulle sijoittuva taide-elokuva Wikimedia-säätiön Varsinais-Suomen", + [ + "1700-luvulle", + "sijoittuva", + "taide-elokuva", + "Wikimedia-säätiön", + "Varsinais-Suomen", + ], ) ] ABBREVIATION_INFLECTION_TESTS = [ ( "VTT:ssa ennen v:ta 2010 suoritetut mittaukset", - ["VTT:ssa", "ennen", "v:ta", "2010", "suoritetut", "mittaukset"] + ["VTT:ssa", "ennen", "v:ta", "2010", "suoritetut", "mittaukset"], ), - ( - "ALV:n osuus on 24 %.", - ["ALV:n", "osuus", "on", "24", "%", "."] - ), - ( - "Hiihtäjä oli kilpailun 14:s.", - ["Hiihtäjä", "oli", "kilpailun", "14:s", "."] - ) + ("ALV:n osuus on 24 %.", ["ALV:n", "osuus", "on", "24", "%", "."]), + ("Hiihtäjä oli kilpailun 14:s.", ["Hiihtäjä", "oli", "kilpailun", "14:s", "."]), + ("EU:n toimesta tehtiin jotain.", ["EU:n", "toimesta", "tehtiin", "jotain", "."]), ] From ddf63b97a89ab861d3d73fb3051c04e32136254b Mon Sep 17 00:00:00 2001 From: Tom Keefe <8655118+MisterKeefe@users.noreply.github.com> Date: Sat, 22 Feb 2020 13:13:06 +0000 Subject: [PATCH 17/17] make idx available via to_array (#5030) --- .github/contributors/MisterKeefe.md | 106 ++++++++++++++++++++++++++++ spacy/attrs.pxd | 2 + spacy/attrs.pyx | 1 + spacy/symbols.pxd | 2 + spacy/symbols.pyx | 1 + spacy/tests/doc/test_array.py | 11 +++ spacy/tokens/doc.pyx | 4 +- 7 files changed, 126 insertions(+), 1 deletion(-) create mode 100644 .github/contributors/MisterKeefe.md diff --git a/.github/contributors/MisterKeefe.md b/.github/contributors/MisterKeefe.md new file mode 100644 index 000000000..5216fc179 --- /dev/null +++ b/.github/contributors/MisterKeefe.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor 
Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Tom Keefe | +| Company name (if applicable) | / | +| Title or role (if applicable) | / | +| Date | 18 February 2020 | +| GitHub username | MisterKeefe | +| Website (optional) | / | diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd index 4cff4a415..4638fcb82 100644 --- a/spacy/attrs.pxd +++ b/spacy/attrs.pxd @@ -92,3 +92,5 @@ cdef enum attr_id_t: LANG ENT_KB_ID = symbols.ENT_KB_ID ENT_ID = symbols.ENT_ID + + IDX diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index 51eb5c35b..f14cd6ddc 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -91,6 +91,7 @@ IDS = { "SPACY": SPACY, "PROB": PROB, "LANG": LANG, + "IDX": IDX } diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index b6391af11..b24891fdd 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -463,3 +463,5 @@ cdef enum symbol_t: ENT_KB_ID ENT_ID + + IDX \ No newline at end of file diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index d82cf036d..e438caba5 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -93,6 +93,7 @@ IDS = { "SPACY": SPACY, "PROB": PROB, "LANG": LANG, + "IDX": IDX, "ADJ": ADJ, "ADP": ADP, diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py index 7b513cfab..aa0d37eca 100644 --- a/spacy/tests/doc/test_array.py +++ b/spacy/tests/doc/test_array.py @@ -66,3 +66,14 @@ def test_doc_array_to_from_string_attrs(en_vocab, attrs): words = ["An", "example", "sentence"] doc = Doc(en_vocab, words=words) Doc(en_vocab, words=words).from_array(attrs, doc.to_array(attrs)) + + +def test_doc_array_idx(en_vocab): + """Test that Doc.to_array can retrieve token start indices""" + words = ["An", "example", "sentence"] + doc = Doc(en_vocab, words=words) + offsets = Doc(en_vocab, words=words).to_array("IDX") + + assert offsets[0] == 0 + assert offsets[1] == 3 + assert offsets[2] == 11 diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 04e02fd98..63495ec86 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -23,7 +23,7 @@ from ..lexeme cimport Lexeme, EMPTY_LEXEME from ..typedefs cimport attr_t, flags_t from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER from ..attrs 
cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB -from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, attr_id_t +from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, attr_id_t from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t from ..attrs import intify_attrs, IDS @@ -73,6 +73,8 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: return token.ent_id elif feat_name == ENT_KB_ID: return token.ent_kb_id + elif feat_name == IDX: + return token.idx else: return Lexeme.get_struct_attr(token.lex, feat_name)
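A short usage sketch of the new attribute, mirroring the added test; the blank English pipeline is an assumption:

```python
# "IDX" now works like any other attribute name in Doc.to_array and returns
# each token's character offset into the text.
import spacy

nlp = spacy.blank("en")
doc = nlp("An example sentence")
offsets = doc.to_array("IDX")
assert offsets[0] == 0   # "An"
assert offsets[1] == 3   # "example"
assert offsets[2] == 11  # "sentence"
```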