diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index 83281543a..ee1be57a3 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -67,10 +67,7 @@ def evaluate(
     corpus = Corpus(data_path, data_path)
     nlp = util.load_model(model)
     dev_dataset = list(corpus.dev_dataset(nlp, gold_preproc=gold_preproc))
-    begin = timer()
     scores = nlp.evaluate(dev_dataset, verbose=False)
-    end = timer()
-    nwords = sum(len(ex.predicted) for ex in dev_dataset)
     metrics = {
         "TOK": "token_acc",
         "TAG": "tag_acc",
@@ -82,17 +79,21 @@ def evaluate(
         "NER P": "ents_p",
         "NER R": "ents_r",
         "NER F": "ents_f",
-        "Textcat": "cats_score",
-        "Sent P": "sents_p",
-        "Sent R": "sents_r",
-        "Sent F": "sents_f",
+        "TEXTCAT": "cats_score",
+        "SENT P": "sents_p",
+        "SENT R": "sents_r",
+        "SENT F": "sents_f",
+        "SPEED": "speed",
     }
     results = {}
     for metric, key in metrics.items():
         if key in scores:
             if key == "cats_score":
                 metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
-            results[metric] = f"{scores[key]*100:.2f}"
+            if key == "speed":
+                results[metric] = f"{scores[key]:.0f}"
+            else:
+                results[metric] = f"{scores[key]*100:.2f}"
     data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()}
 
     msg.table(results, title="Results")
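The hunk above reports the new SPEED row as raw words per second, while the accuracy metrics stay percentages. A minimal sketch of that formatting rule, using made-up score values (nothing below comes from the patch itself):

    # Illustrative only: hypothetical values standing in for nlp.evaluate() output.
    scores = {"tag_acc": 0.973, "ents_f": 0.851, "speed": 15341.2}
    metrics = {"TAG": "tag_acc", "NER F": "ents_f", "SPEED": "speed"}

    results = {}
    for metric, key in metrics.items():
        if key in scores:
            if key == "speed":
                results[metric] = f"{scores[key]:.0f}"      # words per second, unscaled
            else:
                results[metric] = f"{scores[key]*100:.2f}"  # fraction -> percentage
    print(results)  # {'TAG': '97.30', 'NER F': '85.10', 'SPEED': '15341'}
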
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 44597c73e..fbe3a5013 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -1,5 +1,4 @@
 from typing import Optional, Dict, Any, Tuple, Union, Callable, List
-from timeit import default_timer as timer
 import srsly
 import tqdm
 from pathlib import Path
@@ -248,14 +247,11 @@ def create_evaluation_callback(
         dev_examples = list(dev_examples)
         n_words = sum(len(ex.predicted) for ex in dev_examples)
         batch_size = cfg["eval_batch_size"]
-        start_time = timer()
         if optimizer.averages:
             with nlp.use_params(optimizer.averages):
                 scores = nlp.evaluate(dev_examples, batch_size=batch_size)
         else:
             scores = nlp.evaluate(dev_examples, batch_size=batch_size)
-        end_time = timer()
-        wps = n_words / (end_time - start_time)
         # Calculate a weighted sum based on score_weights for the main score
         weights = cfg["score_weights"]
         try:
@@ -264,7 +260,6 @@ def create_evaluation_callback(
             keys = list(scores.keys())
             err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
             raise KeyError(err)
-        scores["speed"] = wps
         return weighted_score, scores
 
     return evaluate
@@ -446,7 +441,7 @@ def update_meta(
     training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
 ) -> None:
     nlp.meta["performance"] = {}
-    for metric in training["scores_weights"]:
+    for metric in training["score_weights"]:
         nlp.meta["performance"][metric] = info["other_scores"][metric]
     for pipe_name in nlp.pipe_names:
         nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
diff --git a/spacy/errors.py b/spacy/errors.py
index a10e5d9bd..3fe53d6db 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -432,12 +432,12 @@ class Errors:
             "Current DocBin: {current}\nOther DocBin: {other}")
     E169 = ("Can't find module: {module}")
     E170 = ("Cannot apply transition {name}: invalid for the current state.")
-    E171 = ("Matcher.add received invalid on_match callback argument: expected "
+    E171 = ("Matcher.add received invalid 'on_match' callback argument: expected "
             "callable or None, but got: {arg_type}")
     E175 = ("Can't remove rule for unknown match pattern ID: {key}")
     E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
     E177 = ("Ill-formed IOB input detected: {tag}")
-    E178 = ("Invalid pattern. Expected list of dicts but got: {pat}. Maybe you "
+    E178 = ("Each pattern should be a list of dicts, but got: {pat}. Maybe you "
             "accidentally passed a single pattern to Matcher.add instead of a "
             "list of patterns? If you only want to add one pattern, make sure "
             "to wrap it in a list. For example: matcher.add('{key}', [pattern])")
@@ -483,6 +483,10 @@ class Errors:
     E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
 
     # TODO: fix numbering after merging develop into master
+    E947 = ("Matcher.add received invalid 'greedy' argument: expected "
+            "a string value from {expected} but got: '{arg}'")
+    E948 = ("Matcher.add received invalid 'patterns' argument: expected "
+            "a List, but got: {arg_type}")
     E952 = ("The section '{name}' is not a valid section in the provided config.")
     E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
     E954 = ("The Tok2Vec listener did not receive a valid input.")
diff --git a/spacy/language.py b/spacy/language.py
index a75295ca5..9dd8a347e 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -14,6 +14,7 @@ from thinc.api import get_current_ops, Config, require_gpu, Optimizer
 import srsly
 import multiprocessing as mp
 from itertools import chain, cycle
+from timeit import default_timer as timer
 
 from .tokens.underscore import Underscore
 from .vocab import Vocab, create_vocab
@@ -1130,7 +1131,14 @@ class Language:
         kwargs.setdefault("verbose", verbose)
         kwargs.setdefault("nlp", self)
         scorer = Scorer(**kwargs)
-        docs = list(eg.predicted for eg in examples)
+        texts = [eg.reference.text for eg in examples]
+        docs = [eg.predicted for eg in examples]
+        start_time = timer()
+        # tokenize the texts only for timing purposes
+        if not hasattr(self.tokenizer, "pipe"):
+            _ = [self.tokenizer(text) for text in texts]
+        else:
+            _ = list(self.tokenizer.pipe(texts))
         for name, pipe in self.pipeline:
             kwargs = component_cfg.get(name, {})
             kwargs.setdefault("batch_size", batch_size)
@@ -1138,11 +1146,18 @@
             docs = _pipe(docs, pipe, kwargs)
         else:
             docs = pipe.pipe(docs, **kwargs)
+        # iterate over the final generator
+        if len(self.pipeline):
+            docs = list(docs)
+        end_time = timer()
         for i, (doc, eg) in enumerate(zip(docs, examples)):
             if verbose:
                 print(doc)
             eg.predicted = doc
-        return scorer.score(examples)
+        results = scorer.score(examples)
+        n_words = sum(len(eg.predicted) for eg in examples)
+        results["speed"] = n_words / (end_time - start_time)
+        return results
 
     @contextmanager
     def use_params(self, params: dict):
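The figure added to the results dict here is plain words per second: the token count of the predicted docs divided by the wall-clock time spent tokenizing the texts and running the pipeline. A self-contained sketch of the same arithmetic on a blank English pipeline (it does not go through Language.evaluate; the texts are arbitrary placeholders):

    from timeit import default_timer as timer
    from spacy.lang.en import English

    nlp = English()  # tokenizer-only pipeline, just for illustration
    texts = ["This is a sentence.", "And this is another one."]

    start_time = timer()
    docs = list(nlp.pipe(texts))
    end_time = timer()

    n_words = sum(len(doc) for doc in docs)
    print(f"{n_words / (end_time - start_time):.0f} words per second")
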
diff --git a/spacy/matcher/matcher.pxd b/spacy/matcher/matcher.pxd
index 689734079..e1f6bc773 100644
--- a/spacy/matcher/matcher.pxd
+++ b/spacy/matcher/matcher.pxd
@@ -66,6 +66,7 @@ cdef class Matcher:
     cdef public object validate
     cdef public object _patterns
     cdef public object _callbacks
+    cdef public object _filter
     cdef public object _extensions
     cdef public object _extra_predicates
     cdef public object _seen_attrs
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 706cfdd68..325c81369 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -1,6 +1,9 @@
 # cython: infer_types=True, cython: profile=True
+from typing import List
+
 from libcpp.vector cimport vector
 from libc.stdint cimport int32_t
+from libc.string cimport memset, memcmp
 from cymem.cymem cimport Pool
 from murmurhash.mrmr cimport hash64
 
@@ -41,6 +44,7 @@ cdef class Matcher:
         self._extra_predicates = []
         self._patterns = {}
         self._callbacks = {}
+        self._filter = {}
         self._extensions = {}
         self._seen_attrs = set()
         self.vocab = vocab
@@ -68,7 +72,7 @@
         """
         return self._normalize_key(key) in self._patterns
 
-    def add(self, key, patterns, *_patterns, on_match=None):
+    def add(self, key, patterns, *, on_match=None, greedy: str=None):
         """Add a match-rule to the matcher. A match-rule consists of: an ID
         key, an on_match callback, and one or more patterns.
 
@@ -86,11 +90,10 @@
         '+': Require the pattern to match 1 or more times.
         '*': Allow the pattern to zero or more times.
 
-        The + and * operators are usually interpretted "greedily", i.e. longer
-        matches are returned where possible. However, if you specify two '+'
-        and '*' patterns in a row and their matches overlap, the first
-        operator will behave non-greedily. This quirk in the semantics makes
-        the matcher more efficient, by avoiding the need for back-tracking.
+        The + and * operators return all possible matches (not just the greedy
+        ones). However, the "greedy" argument can filter the final matches
+        by returning a non-overlapping set per key, either taking preference to
+        the first greedy match ("FIRST"), or the longest ("LONGEST").
 
         As of spaCy v2.2.2, Matcher.add supports the future API, which makes
         the patterns the second argument and a list (instead of a variable
@@ -100,16 +103,15 @@
         key (str): The match ID.
         patterns (list): The patterns to add for the given key.
         on_match (callable): Optional callback executed on match.
-        *_patterns (list): For backwards compatibility: list of patterns to add
-            as variable arguments. Will be ignored if a list of patterns is
-            provided as the second argument.
+        greedy (str): Optional filter: "FIRST" or "LONGEST".
         """
         errors = {}
         if on_match is not None and not hasattr(on_match, "__call__"):
             raise ValueError(Errors.E171.format(arg_type=type(on_match)))
-        if patterns is None or hasattr(patterns, "__call__"):  # old API
-            on_match = patterns
-            patterns = _patterns
+        if patterns is None or not isinstance(patterns, List):  # old API
+            raise ValueError(Errors.E948.format(arg_type=type(patterns)))
+        if greedy is not None and greedy not in ["FIRST", "LONGEST"]:
+            raise ValueError(Errors.E947.format(expected=["FIRST", "LONGEST"], arg=greedy))
         for i, pattern in enumerate(patterns):
             if len(pattern) == 0:
                 raise ValueError(Errors.E012.format(key=key))
@@ -132,6 +134,7 @@
                 raise ValueError(Errors.E154.format())
         self._patterns.setdefault(key, [])
         self._callbacks[key] = on_match
+        self._filter[key] = greedy
         self._patterns[key].extend(patterns)
 
     def remove(self, key):
@@ -217,6 +220,7 @@
             length = doclike.end - doclike.start
         else:
             raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
+        cdef Pool tmp_pool = Pool()
         if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \
           and not doc.is_tagged:
             raise ValueError(Errors.E155.format())
@@ -224,11 +228,42 @@
             raise ValueError(Errors.E156.format())
         matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
                                extensions=self._extensions, predicates=self._extra_predicates)
-        for i, (key, start, end) in enumerate(matches):
+        final_matches = []
+        pairs_by_id = {}
+        # For each key, either add all matches, or only the filtered, non-overlapping ones
+        for (key, start, end) in matches:
+            span_filter = self._filter.get(key)
+            if span_filter is not None:
+                pairs = pairs_by_id.get(key, [])
+                pairs.append((start,end))
+                pairs_by_id[key] = pairs
+            else:
+                final_matches.append((key, start, end))
+        matched = tmp_pool.alloc(length, sizeof(char))
+        empty = tmp_pool.alloc(length, sizeof(char))
+        for key, pairs in pairs_by_id.items():
+            memset(matched, 0, length * sizeof(matched[0]))
+            span_filter = self._filter.get(key)
+            if span_filter == "FIRST":
+                sorted_pairs = sorted(pairs, key=lambda x: (x[0], -x[1]), reverse=False) # sort by start
+            elif span_filter == "LONGEST":
+                sorted_pairs = sorted(pairs, key=lambda x: (x[1]-x[0], -x[0]), reverse=True) # reverse sort by length
+            else:
+                raise ValueError(Errors.E947.format(expected=["FIRST", "LONGEST"], arg=span_filter))
+            for (start, end) in sorted_pairs:
+                assert 0 <= start < end  # Defend against segfaults
+                span_len = end-start
+                # If no tokens in the span have matched
+                if memcmp(&matched[start], &empty[start], span_len * sizeof(matched[0])) == 0:
+                    final_matches.append((key, start, end))
+                    # Mark tokens that have matched
+                    memset(&matched[start], 1, span_len * sizeof(matched[0]))
+        # perform the callbacks on the filtered set of results
+        for i, (key, start, end) in enumerate(final_matches):
             on_match = self._callbacks.get(key, None)
             if on_match is not None:
-                on_match(self, doc, i, matches)
-        return matches
+                on_match(self, doc, i, final_matches)
+        return final_matches
 
     def _normalize_key(self, key):
         if isinstance(key, basestring):
@@ -239,9 +274,9 @@
 
 def unpickle_matcher(vocab, patterns, callbacks):
     matcher = Matcher(vocab)
-    for key, specs in patterns.items():
+    for key, pattern in patterns.items():
         callback = callbacks.get(key, None)
-        matcher.add(key, callback, *specs)
+        matcher.add(key, pattern, on_match=callback)
     return matcher
 
 
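The per-key filter above keeps a non-overlapping subset of the candidate spans: "FIRST" sorts them by start (preferring the longer span on ties), "LONGEST" sorts them by length (preferring the earlier span on ties), and a span is only kept if none of its tokens were already claimed by an earlier pick. A plain-Python restatement of that selection step, with a set of token indices standing in for the C char buffer (a sketch, not the Cython implementation):

    def filter_spans_greedy(pairs, span_filter):
        # pairs: list of (start, end) token offsets collected for one match key
        if span_filter == "FIRST":
            sorted_pairs = sorted(pairs, key=lambda x: (x[0], -x[1]))
        elif span_filter == "LONGEST":
            sorted_pairs = sorted(pairs, key=lambda x: (x[1] - x[0], -x[0]), reverse=True)
        else:
            raise ValueError(span_filter)
        matched = set()  # token indices already covered by an accepted span
        final = []
        for start, end in sorted_pairs:
            if not any(i in matched for i in range(start, end)):
                final.append((start, end))
                matched.update(range(start, end))
        return final

    print(filter_spans_greedy([(0, 2), (1, 3), (1, 5)], "FIRST"))    # [(0, 2)]
    print(filter_spans_greedy([(0, 2), (1, 3), (1, 5)], "LONGEST"))  # [(1, 5)]
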
diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py
index 8a6a5188f..501884873 100644
--- a/spacy/pipeline/functions.py
+++ b/spacy/pipeline/functions.py
@@ -58,7 +58,7 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
     """
     # TODO: make stateful component with "label" config
     merger = Matcher(doc.vocab)
-    merger.add("SUBTOK", None, [{"DEP": label, "op": "+"}])
+    merger.add("SUBTOK", [[{"DEP": label, "op": "+"}]])
     matches = merger(doc)
     spans = filter_spans([doc[start : end + 1] for _, start, end in matches])
     with doc.retokenize() as retokenizer:
diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
index 98542e80f..bcb224bd3 100644
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -63,18 +63,11 @@ def test_matcher_len_contains(matcher):
     assert "TEST2" not in matcher
 
 
-def test_matcher_add_new_old_api(en_vocab):
+def test_matcher_add_new_api(en_vocab):
    doc = Doc(en_vocab, words=["a", "b"])
     patterns = [[{"TEXT": "a"}], [{"TEXT": "a"}, {"TEXT": "b"}]]
     matcher = Matcher(en_vocab)
-    matcher.add("OLD_API", None, *patterns)
-    assert len(matcher(doc)) == 2
-    matcher = Matcher(en_vocab)
     on_match = Mock()
-    matcher.add("OLD_API_CALLBACK", on_match, *patterns)
-    assert len(matcher(doc)) == 2
-    assert on_match.call_count == 2
-    # New API: add(key: str, patterns: List[List[dict]], on_match: Callable)
     matcher = Matcher(en_vocab)
     matcher.add("NEW_API", patterns)
     assert len(matcher(doc)) == 2
@@ -176,7 +169,7 @@ def test_matcher_match_zero_plus(matcher):
 
 def test_matcher_match_one_plus(matcher):
     control = Matcher(matcher.vocab)
-    control.add("BasicPhilippe", None, [{"ORTH": "Philippe"}])
+    control.add("BasicPhilippe", [[{"ORTH": "Philippe"}]])
     doc = Doc(control.vocab, words=["Philippe", "Philippe"])
     m = control(doc)
     assert len(m) == 2
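For reference, a small usage sketch of the call-site change these tests encode: patterns go in as one list of pattern lists, the callback is keyword-only, and the greedy filter is opt-in. The pattern content and the "PATTERNS" key are made up for the example:

    from spacy.lang.en import English
    from spacy.matcher import Matcher

    nlp = English()
    patterns = [[{"TEXT": "a"}], [{"TEXT": "a"}, {"TEXT": "b"}]]

    def on_match(matcher, doc, i, matches):
        key, start, end = matches[i]
        print("matched:", doc[start:end].text)

    matcher = Matcher(nlp.vocab)
    # old, removed form:  matcher.add("PATTERNS", on_match, *patterns)
    matcher.add("PATTERNS", patterns, on_match=on_match, greedy="LONGEST")
    print(matcher(nlp("a b")))  # only the longest non-overlapping span per key is kept
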
diff --git a/spacy/tests/matcher/test_matcher_logic.py b/spacy/tests/matcher/test_matcher_logic.py
index a2b2cd83f..8f4c13471 100644
--- a/spacy/tests/matcher/test_matcher_logic.py
+++ b/spacy/tests/matcher/test_matcher_logic.py
@@ -7,18 +7,10 @@ from spacy.tokens import Doc, Span
 
 
 pattern1 = [{"ORTH": "A"}, {"ORTH": "A", "OP": "*"}]
-pattern2 = [{"ORTH": "A"}, {"ORTH": "A"}]
+pattern2 = [{"ORTH": "A", "OP": "*"}, {"ORTH": "A"}]
 pattern3 = [{"ORTH": "A"}, {"ORTH": "A"}]
-pattern4 = [
-    {"ORTH": "B"},
-    {"ORTH": "A", "OP": "*"},
-    {"ORTH": "B"},
-]
-pattern5 = [
-    {"ORTH": "B", "OP": "*"},
-    {"ORTH": "A", "OP": "*"},
-    {"ORTH": "B"},
-]
+pattern4 = [{"ORTH": "B"}, {"ORTH": "A", "OP": "*"}, {"ORTH": "B"}]
+pattern5 = [{"ORTH": "B", "OP": "*"}, {"ORTH": "A", "OP": "*"}, {"ORTH": "B"}]
 
 re_pattern1 = "AA*"
 re_pattern2 = "A*A"
@@ -26,10 +18,16 @@ re_pattern3 = "AA"
 re_pattern4 = "BA*B"
 re_pattern5 = "B*A*B"
 
+longest1 = "A A A A A"
+longest2 = "A A A A A"
+longest3 = "A A"
+longest4 = "B A A A A A B"  # "FIRST" would be "B B"
+longest5 = "B B A A A A A B"
+
 
 @pytest.fixture
 def text():
-    return "(ABBAAAAAB)."
+    return "(BBAAAAAB)."
 
 @pytest.fixture
 def doc(en_tokenizer, text):
@@ -41,25 +39,63 @@
 @pytest.mark.parametrize(
     "pattern,re_pattern",
     [
-        pytest.param(pattern1, re_pattern1, marks=pytest.mark.xfail()),
-        pytest.param(pattern2, re_pattern2, marks=pytest.mark.xfail()),
-        pytest.param(pattern3, re_pattern3, marks=pytest.mark.xfail()),
+        (pattern1, re_pattern1),
+        (pattern2, re_pattern2),
+        (pattern3, re_pattern3),
         (pattern4, re_pattern4),
-        pytest.param(pattern5, re_pattern5, marks=pytest.mark.xfail()),
+        (pattern5, re_pattern5),
     ],
 )
-def test_greedy_matching(doc, text, pattern, re_pattern):
-    """Test that the greedy matching behavior of the * op is consistant with
+def test_greedy_matching_first(doc, text, pattern, re_pattern):
+    """Test that the greedy matching behavior "FIRST" is consistent with
     other re implementations."""
     matcher = Matcher(doc.vocab)
-    matcher.add(re_pattern, [pattern])
+    matcher.add(re_pattern, [pattern], greedy="FIRST")
     matches = matcher(doc)
     re_matches = [m.span() for m in re.finditer(re_pattern, text)]
-    for match, re_match in zip(matches, re_matches):
-        assert match[1:] == re_match
+    for (key, m_s, m_e), (re_s, re_e) in zip(matches, re_matches):
+        # matching the string, not the exact position
+        assert doc[m_s:m_e].text == doc[re_s:re_e].text
+
+
+@pytest.mark.parametrize(
+    "pattern,longest",
+    [
+        (pattern1, longest1),
+        (pattern2, longest2),
+        (pattern3, longest3),
+        (pattern4, longest4),
+        (pattern5, longest5),
+    ],
+)
+def test_greedy_matching_longest(doc, text, pattern, longest):
+    """Test the "LONGEST" greedy matching behavior"""
+    matcher = Matcher(doc.vocab)
+    matcher.add("RULE", [pattern], greedy="LONGEST")
+    matches = matcher(doc)
+    for (key, s, e) in matches:
+        assert doc[s:e].text == longest
+
+
+def test_greedy_matching_longest_first(en_tokenizer):
+    """Test that "LONGEST" matching prefers the first of two equally long matches"""
+    doc = en_tokenizer(" ".join("CCC"))
+    matcher = Matcher(doc.vocab)
+    pattern = [{"ORTH": "C"}, {"ORTH": "C"}]
+    matcher.add("RULE", [pattern], greedy="LONGEST")
+    matches = matcher(doc)
+    # out of 0-2 and 1-3, the first should be picked
+    assert len(matches) == 1
+    assert matches[0][1] == 0
+    assert matches[0][2] == 2
+
+
+def test_invalid_greediness(doc, text):
+    matcher = Matcher(doc.vocab)
+    with pytest.raises(ValueError):
+        matcher.add("RULE", [pattern1], greedy="GREEDY")
 
 
-@pytest.mark.xfail
 @pytest.mark.parametrize(
     "pattern,re_pattern",
     [
@@ -74,7 +110,7 @@ def test_match_consuming(doc, text, pattern, re_pattern):
     """Test that matcher.__call__ consumes tokens on a match similar to
     re.findall."""
     matcher = Matcher(doc.vocab)
-    matcher.add(re_pattern, [pattern])
+    matcher.add(re_pattern, [pattern], greedy="FIRST")
     matches = matcher(doc)
     re_matches = [m.span() for m in re.finditer(re_pattern, text)]
     assert len(matches) == len(re_matches)
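In the same spirit as the consuming test above, a toy check (not part of the patch) that greedy="FIRST" walks the text left to right the way re.finditer does:

    import re
    from spacy.lang.en import English
    from spacy.matcher import Matcher

    nlp = English()
    doc = nlp("A A A A")
    matcher = Matcher(nlp.vocab)
    matcher.add("AA", [[{"ORTH": "A"}, {"ORTH": "A"}]], greedy="FIRST")

    print([(start, end) for _, start, end in matcher(doc)])  # [(0, 2), (2, 4)]
    print([m.span() for m in re.finditer("AA", "AAAA")])      # [(0, 2), (2, 4)]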