mirror of
https://github.com/explosion/spaCy.git
synced 2025-06-05 05:33:15 +03:00
Merge pull request #1397 from explosion/feature/matcher-wildcard-token
💫 Allow empty dictionaries to match any token in Matcher
This commit is contained in:
commit
36c68015f3
|
@ -17,7 +17,7 @@ from libcpp.pair cimport pair
|
||||||
from murmurhash.mrmr cimport hash64
|
from murmurhash.mrmr cimport hash64
|
||||||
from libc.stdint cimport int32_t
|
from libc.stdint cimport int32_t
|
||||||
|
|
||||||
from .attrs cimport ID, ENT_TYPE
|
from .attrs cimport ID, NULL_ATTR, ENT_TYPE
|
||||||
from . import attrs
|
from . import attrs
|
||||||
from .tokens.doc cimport get_token_attr
|
from .tokens.doc cimport get_token_attr
|
||||||
from .tokens.doc cimport Doc
|
from .tokens.doc cimport Doc
|
||||||
|
@ -142,6 +142,10 @@ def _convert_strings(token_specs, string_store):
|
||||||
tokens = []
|
tokens = []
|
||||||
op = ONE
|
op = ONE
|
||||||
for spec in token_specs:
|
for spec in token_specs:
|
||||||
|
if not spec:
|
||||||
|
# Signifier for 'any token'
|
||||||
|
tokens.append((ONE, [(NULL_ATTR, 0)]))
|
||||||
|
continue
|
||||||
token = []
|
token = []
|
||||||
ops = (ONE,)
|
ops = (ONE,)
|
||||||
for attr, value in spec.items():
|
for attr, value in spec.items():
|
||||||
|
@ -295,7 +299,7 @@ cdef class Matcher:
|
||||||
"""Find all token sequences matching the supplied patterns on the `Doc`.
|
"""Find all token sequences matching the supplied patterns on the `Doc`.
|
||||||
|
|
||||||
doc (Doc): The document to match over.
|
doc (Doc): The document to match over.
|
||||||
RETURNS (list): A list of `(key, label_id, start, end)` tuples,
|
RETURNS (list): A list of `(key, start, end)` tuples,
|
||||||
describing the matches. A match tuple describes a span
|
describing the matches. A match tuple describes a span
|
||||||
`doc[start:end]`. The `label_id` and `key` are both integers.
|
`doc[start:end]`. The `key` is an integer.
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -98,6 +98,20 @@ def test_matcher_match_multi(matcher):
|
||||||
(doc.vocab.strings['Java'], 5, 6)]
|
(doc.vocab.strings['Java'], 5, 6)]
|
||||||
|
|
||||||
|
|
||||||
|
def test_matcher_empty_dict(en_vocab):
|
||||||
|
'''Test matcher allows empty token specs, meaning match on any token.'''
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
abc = ["a", "b", "c"]
|
||||||
|
doc = get_doc(matcher.vocab, abc)
|
||||||
|
matcher.add('A.C', None, [{'ORTH': 'a'}, {}, {'ORTH': 'c'}])
|
||||||
|
matches = matcher(doc)
|
||||||
|
assert len(matches) == 1
|
||||||
|
assert matches[0][1:] == (0, 3)
|
||||||
|
matcher.add('A.', None, [{'ORTH': 'a'}, {}])
|
||||||
|
matches = matcher(doc)
|
||||||
|
assert matches[0][1:] == (0, 2)
|
||||||
|
|
||||||
|
|
||||||
def test_matcher_phrase_matcher(en_vocab):
|
def test_matcher_phrase_matcher(en_vocab):
|
||||||
words = ["Google", "Now"]
|
words = ["Google", "Now"]
|
||||||
doc = get_doc(en_vocab, words)
|
doc = get_doc(en_vocab, words)
|
||||||
|
|
|
@ -464,6 +464,13 @@ p The L2 norm of the token's vector representation.
|
||||||
| Is the token in lowercase? Equivalent to
|
| Is the token in lowercase? Equivalent to
|
||||||
| #[code token.text.islower()].
|
| #[code token.text.islower()].
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code is_upper]
|
||||||
|
+cell bool
|
||||||
|
+cell
|
||||||
|
| Is the token in uppercase? Equivalent to
|
||||||
|
| #[code token.text.isupper()].
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code is_title]
|
+cell #[code is_title]
|
||||||
+cell bool
|
+cell bool
|
||||||
|
|
|
@ -75,6 +75,131 @@ p
|
||||||
| other pattern types. You shouldn't have to create different matchers for
|
| other pattern types. You shouldn't have to create different matchers for
|
||||||
| each of those processes.
|
| each of those processes.
|
||||||
|
|
||||||
|
+h(4, "adding-patterns-attributes") Available token attributes
|
||||||
|
|
||||||
|
p
|
||||||
|
| The available token pattern keys are uppercase versions of the
|
||||||
|
| #[+api("token#attributes") #[code Token] attributes]. The most relevant
|
||||||
|
| ones for rule-based matching are:
|
||||||
|
|
||||||
|
+table(["Attribute", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code ORTH]
|
||||||
|
+cell The exact verbatim text of a token.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell.u-nowrap #[code LOWER], #[code UPPER]
|
||||||
|
+cell The lowercase, uppercase form of the token text.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell.u-nowrap #[code IS_ALPHA], #[code IS_ASCII], #[code IS_DIGIT]
|
||||||
|
+cell
|
||||||
|
| Token text consists of alphabetic characters, ASCII characters,
|
||||||
|
| digits.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell.u-nowrap #[code IS_LOWER], #[code IS_UPPER], #[code IS_TITLE]
|
||||||
|
+cell Token text is in lowercase, uppercase, titlecase.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell.u-nowrap #[code IS_PUNCT], #[code IS_SPACE], #[code IS_STOP]
|
||||||
|
+cell Token is punctuation, whitespace, stop word.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell.u-nowrap #[code LIKE_NUM], #[code LIKE_URL], #[code LIKE_EMAIL]
|
||||||
|
+cell Token text resembles a number, URL, email.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell.u-nowrap
|
||||||
|
| #[code POS], #[code TAG], #[code DEP], #[code LEMMA],
|
||||||
|
| #[code SHAPE]
|
||||||
|
+cell
|
||||||
|
| The token's simple and extended part-of-speech tag, dependency
|
||||||
|
| label, lemma, shape.
|
||||||
|
|
||||||
|
+h(4, "adding-patterns-wildcard") Using wildcard token patterns
|
||||||
|
+tag-new(2)
|
||||||
|
|
||||||
|
p
|
||||||
|
| While the token attributes offer many options to write highly specific
|
||||||
|
| patterns, you can also use an empty dictionary, #[code {}] as a wildcard
|
||||||
|
| representing #[strong any token]. This is useful if you know the context
|
||||||
|
| of what you're trying to match, but very little about the specific token
|
||||||
|
| and its characters. For example, let's say you're trying to extract
|
||||||
|
| people's user names from your data. All you know is that they are listed
|
||||||
|
| as "User name: {username}". The name itself may contain any character,
|
||||||
|
| but no whitespace – so you'll know it will be handled as one token.
|
||||||
|
|
||||||
|
+code.
|
||||||
|
[{'ORTH': 'User'}, {'ORTH': 'name'}, {'ORTH': ':'}, {}]
|
||||||
|
|
||||||
|
+h(4, "quantifiers") Using operators and quantifiers
|
||||||
|
|
||||||
|
p
|
||||||
|
| The matcher also lets you use quantifiers, specified as the #[code 'OP']
|
||||||
|
| key. Quantifiers let you define sequences of tokens to be matched, e.g.
|
||||||
|
| one or more punctuation marks, or specify optional tokens. Note that there
|
||||||
|
| are no nested or scoped quantifiers – instead, you can build those
|
||||||
|
| behaviours with #[code on_match] callbacks.
|
||||||
|
|
||||||
|
+aside("Problems with quantifiers")
|
||||||
|
| Using quantifiers may lead to unexpected results when matching
|
||||||
|
| variable-length patterns, for example if the next token would also be
|
||||||
|
| matched by the previous token. This problem should be resolved in a future
|
||||||
|
| release. For more information, see
|
||||||
|
| #[+a(gh("spaCy") + "/issues/864") this issue].
|
||||||
|
|
||||||
|
+table([ "OP", "Description", "Example"])
|
||||||
|
+row
|
||||||
|
+cell #[code !]
|
||||||
|
+cell match exactly 0 times
|
||||||
|
+cell negation
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code *]
|
||||||
|
+cell match 0 or more times
|
||||||
|
+cell optional, variable number
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code +]
|
||||||
|
+cell match 1 or more times
|
||||||
|
+cell mandatory, variable number
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code ?]
|
||||||
|
+cell match 0 or 1 times
|
||||||
|
+cell optional, max one
|
||||||
|
|
||||||
|
+h(3, "adding-phrase-patterns") Adding phrase patterns
|
||||||
|
|
||||||
|
p
|
||||||
|
| If you need to match large terminology lists, you can also use the
|
||||||
|
| #[+api("phrasematcher") #[code PhraseMatcher]] and create
|
||||||
|
| #[+api("doc") #[code Doc]] objects instead of token patterns, which is
|
||||||
|
| much more efficient overall. The #[code Doc] patterns can contain single
|
||||||
|
| or multiple tokens.
|
||||||
|
|
||||||
|
+code.
|
||||||
|
import spacy
|
||||||
|
from spacy.matcher import PhraseMatcher
|
||||||
|
|
||||||
|
nlp = spacy.load('en')
|
||||||
|
matcher = PhraseMatcher(nlp.vocab)
|
||||||
|
terminology_list = ['Barack Obama', 'Angela Merkel', 'Washington, D.C.']
|
||||||
|
patterns = [nlp(text) for text in terminology_list]
|
||||||
|
matcher.add('TerminologyList', None, *patterns)
|
||||||
|
|
||||||
|
doc = nlp(u"German Chancellor Angela Merkel and US President Barack Obama "
|
||||||
|
u"converse in the Oval Office inside the White House in Washington, D.C.")
|
||||||
|
matches = matcher(doc)
|
||||||
|
|
||||||
|
p
|
||||||
|
| Since spaCy is used for processing both the patterns and the text to be
|
||||||
|
| matched, you won't have to worry about specific tokenization – for
|
||||||
|
| example, you can simply pass in #[code nlp(u"Washington, D.C.")] and
|
||||||
|
| won't have to write a complex token pattern covering the exact
|
||||||
|
| tokenization of the term.
|
||||||
|
|
||||||
+h(3, "on_match") Adding #[code on_match] rules
|
+h(3, "on_match") Adding #[code on_match] rules
|
||||||
|
|
||||||
p
|
p
|
||||||
|
@ -183,43 +308,6 @@ p
|
||||||
| A list of #[code (match_id, start, end)] tuples, describing the
|
| A list of #[code (match_id, start, end)] tuples, describing the
|
||||||
| matches. A match tuple describes a span #[code doc[start:end]].
|
| matches. A match tuple describes a span #[code doc[start:end]].
|
||||||
|
|
||||||
+h(3, "quantifiers") Using operators and quantifiers
|
|
||||||
|
|
||||||
p
|
|
||||||
| The matcher also lets you use quantifiers, specified as the #[code 'OP']
|
|
||||||
| key. Quantifiers let you define sequences of tokens to be matched, e.g.
|
|
||||||
| one or more punctuation marks, or specify optional tokens. Note that there
|
|
||||||
| are no nested or scoped quantifiers – instead, you can build those
|
|
||||||
| behaviours with #[code on_match] callbacks.
|
|
||||||
|
|
||||||
+aside("Problems with quantifiers")
|
|
||||||
| Using quantifiers may lead to unexpected results when matching
|
|
||||||
| variable-length patterns, for example if the next token would also be
|
|
||||||
| matched by the previous token. This problem should be resolved in a future
|
|
||||||
| release. For more information, see
|
|
||||||
| #[+a(gh("spaCy") + "/issues/864") this issue].
|
|
||||||
|
|
||||||
+table([ "OP", "Description", "Example"])
|
|
||||||
+row
|
|
||||||
+cell #[code !]
|
|
||||||
+cell match exactly 0 times
|
|
||||||
+cell negation
|
|
||||||
|
|
||||||
+row
|
|
||||||
+cell #[code *]
|
|
||||||
+cell match 0 or more times
|
|
||||||
+cell optional, variable number
|
|
||||||
|
|
||||||
+row
|
|
||||||
+cell #[code +]
|
|
||||||
+cell match 1 or more times
|
|
||||||
+cell mandatory, variable number
|
|
||||||
|
|
||||||
+row
|
|
||||||
+cell #[code ?]
|
|
||||||
+cell match 0 or 1 times
|
|
||||||
+cell optional, max one
|
|
||||||
|
|
||||||
+h(3, "example1") Example: Using linguistic annotations
|
+h(3, "example1") Example: Using linguistic annotations
|
||||||
|
|
||||||
p
|
p
|
||||||
|
|
Loading…
Reference in New Issue
Block a user