mirror of
https://github.com/explosion/spaCy.git
synced 2025-04-30 22:03:41 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
f0bcc0bd8d
|
@ -10,27 +10,28 @@ _html = {}
|
||||||
IS_JUPYTER = is_in_jupyter()
|
IS_JUPYTER = is_in_jupyter()
|
||||||
|
|
||||||
|
|
||||||
def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER, options={}):
|
def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
|
||||||
|
options={}, manual=False):
|
||||||
"""Render displaCy visualisation.
|
"""Render displaCy visualisation.
|
||||||
|
|
||||||
docs (list or Doc): Document(s) to visualise.
|
docs (list or Doc): Document(s) to visualise.
|
||||||
style (unicode): Visualisation style, 'dep' or 'ent'.
|
style (unicode): Visualisation style, 'dep' or 'ent'.
|
||||||
page (bool): Render markup as full HTML page.
|
page (bool): Render markup as full HTML page.
|
||||||
minify (bool): Minify HTML markup.
|
minify (bool): Minify HTML markup.
|
||||||
jupyter (bool): Experimental, use Jupyter's display() to output markup.
|
jupyter (bool): Experimental, use Jupyter's `display()` to output markup.
|
||||||
options (dict): Visualiser-specific options, e.g. colors.
|
options (dict): Visualiser-specific options, e.g. colors.
|
||||||
|
manual (bool): Don't parse `Doc` and instead, expect a dict or list of dicts.
|
||||||
RETURNS (unicode): Rendered HTML markup.
|
RETURNS (unicode): Rendered HTML markup.
|
||||||
"""
|
"""
|
||||||
if isinstance(docs, Doc):
|
factories = {'dep': (DependencyRenderer, parse_deps),
|
||||||
docs = [docs]
|
'ent': (EntityRenderer, parse_ents)}
|
||||||
if style == 'dep':
|
if style not in factories:
|
||||||
renderer = DependencyRenderer(options=options)
|
|
||||||
parsed = [parse_deps(doc, options) for doc in docs]
|
|
||||||
elif style == 'ent':
|
|
||||||
renderer = EntityRenderer(options=options)
|
|
||||||
parsed = [parse_ents(doc, options) for doc in docs]
|
|
||||||
else:
|
|
||||||
raise ValueError("Unknown style: %s" % style)
|
raise ValueError("Unknown style: %s" % style)
|
||||||
|
if isinstance(docs, Doc) or isinstance(docs, dict):
|
||||||
|
docs = [docs]
|
||||||
|
renderer, converter = factories[style]
|
||||||
|
renderer = renderer(options=options)
|
||||||
|
parsed = [converter(doc, options) for doc in docs] if not manual else docs
|
||||||
_html['parsed'] = renderer.render(parsed, page=page, minify=minify).strip()
|
_html['parsed'] = renderer.render(parsed, page=page, minify=minify).strip()
|
||||||
html = _html['parsed']
|
html = _html['parsed']
|
||||||
if jupyter: # return HTML rendered by IPython display()
|
if jupyter: # return HTML rendered by IPython display()
|
||||||
|
@ -39,7 +40,8 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER, opti
|
||||||
return html
|
return html
|
||||||
|
|
||||||
|
|
||||||
def serve(docs, style='dep', page=True, minify=False, options={}, port=5000):
|
def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
|
||||||
|
port=5000):
|
||||||
"""Serve displaCy visualisation.
|
"""Serve displaCy visualisation.
|
||||||
|
|
||||||
docs (list or Doc): Document(s) to visualise.
|
docs (list or Doc): Document(s) to visualise.
|
||||||
|
@ -47,10 +49,11 @@ def serve(docs, style='dep', page=True, minify=False, options={}, port=5000):
|
||||||
page (bool): Render markup as full HTML page.
|
page (bool): Render markup as full HTML page.
|
||||||
minify (bool): Minify HTML markup.
|
minify (bool): Minify HTML markup.
|
||||||
options (dict): Visualiser-specific options, e.g. colors.
|
options (dict): Visualiser-specific options, e.g. colors.
|
||||||
|
manual (bool): Don't parse `Doc` and instead, expect a dict or list of dicts.
|
||||||
port (int): Port to serve visualisation.
|
port (int): Port to serve visualisation.
|
||||||
"""
|
"""
|
||||||
from wsgiref import simple_server
|
from wsgiref import simple_server
|
||||||
render(docs, style=style, page=page, minify=minify, options=options)
|
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
|
||||||
httpd = simple_server.make_server('0.0.0.0', port, app)
|
httpd = simple_server.make_server('0.0.0.0', port, app)
|
||||||
prints("Using the '%s' visualizer" % style, title="Serving on port %d..." % port)
|
prints("Using the '%s' visualizer" % style, title="Serving on port %d..." % port)
|
||||||
httpd.serve_forever()
|
httpd.serve_forever()
|
||||||
|
|
|
@ -175,7 +175,7 @@ class EntityRenderer(object):
|
||||||
minify (bool): Minify HTML markup.
|
minify (bool): Minify HTML markup.
|
||||||
RETURNS (unicode): Rendered HTML markup.
|
RETURNS (unicode): Rendered HTML markup.
|
||||||
"""
|
"""
|
||||||
rendered = [self.render_ents(p['text'], p['ents'], p['title']) for p in parsed]
|
rendered = [self.render_ents(p['text'], p['ents'], p.get('title', None)) for p in parsed]
|
||||||
if page:
|
if page:
|
||||||
docs = ''.join([TPL_FIGURE.format(content=doc) for doc in rendered])
|
docs = ''.join([TPL_FIGURE.format(content=doc) for doc in rendered])
|
||||||
markup = TPL_PAGE.format(content=docs)
|
markup = TPL_PAGE.format(content=docs)
|
||||||
|
|
|
@ -54,6 +54,15 @@ p
|
||||||
+cell #[+a("#options") Visualizer-specific options], e.g. colors.
|
+cell #[+a("#options") Visualizer-specific options], e.g. colors.
|
||||||
+cell #[code {}]
|
+cell #[code {}]
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code manual]
|
||||||
|
+cell bool
|
||||||
|
+cell
|
||||||
|
| Don't parse #[code Doc] and instead, expect a dict or list of
|
||||||
|
| dicts. #[+a("/docs/usage/visualizers#manual-usage") See here]
|
||||||
|
| for formats and examples.
|
||||||
|
+cell #[code False]
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code port]
|
+cell #[code port]
|
||||||
+cell int
|
+cell int
|
||||||
|
@ -111,6 +120,15 @@ p Render a dependency parse tree or named entity visualization.
|
||||||
+cell #[+a("#options") Visualizer-specific options], e.g. colors.
|
+cell #[+a("#options") Visualizer-specific options], e.g. colors.
|
||||||
+cell #[code {}]
|
+cell #[code {}]
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code manual]
|
||||||
|
+cell bool
|
||||||
|
+cell
|
||||||
|
| Don't parse #[code Doc] and instead, expect a dict or list of
|
||||||
|
| dicts. #[+a("/docs/usage/visualizers#manual-usage") See here]
|
||||||
|
| for formats and examples.
|
||||||
|
+cell #[code False]
|
||||||
|
|
||||||
+footrow
|
+footrow
|
||||||
+cell returns
|
+cell returns
|
||||||
+cell unicode
|
+cell unicode
|
||||||
|
|
|
@ -101,6 +101,9 @@ p
|
||||||
| directory. You can then use #[code spacy.load()] to load it via its
|
| directory. You can then use #[code spacy.load()] to load it via its
|
||||||
| package name, create a #[+a("#usage-link") shortcut link] to assign it a
|
| package name, create a #[+a("#usage-link") shortcut link] to assign it a
|
||||||
| custom name, or #[+a("usage-import") import it] explicitly as a module.
|
| custom name, or #[+a("usage-import") import it] explicitly as a module.
|
||||||
|
| If you need to download models as part of an automated process, we
|
||||||
|
| recommend using pip with a direct link, instead of relying on spaCy's
|
||||||
|
| #[+api("cli#download") #[code download]] command.
|
||||||
|
|
||||||
+h(3, "download-manual") Manual download and installation
|
+h(3, "download-manual") Manual download and installation
|
||||||
|
|
||||||
|
@ -162,6 +165,14 @@ p
|
||||||
| The #[+api("cli#link") #[code link]] command will create a symlink
|
| The #[+api("cli#link") #[code link]] command will create a symlink
|
||||||
| in the #[code spacy/data] directory.
|
| in the #[code spacy/data] directory.
|
||||||
|
|
||||||
|
+aside("Why does spaCy use symlinks?")
|
||||||
|
| Symlinks were originally introduced to maintain backwards compatibility,
|
||||||
|
| as older versions expected model data to live within #[code spacy/data].
|
||||||
|
| However, we decided to keep using them in v2.0 instead of opting for
|
||||||
|
| a config file. There'll always be a need for assigning and saving custom
|
||||||
|
| model names or IDs. And your system already comes with a native solution
|
||||||
|
| to mapping unicode aliases to file paths: symbolic links.
|
||||||
|
|
||||||
+code(false, "bash").
|
+code(false, "bash").
|
||||||
python -m spacy link [package name or path] [shortcut] [--force]
|
python -m spacy link [package name or path] [shortcut] [--force]
|
||||||
|
|
||||||
|
@ -179,7 +190,7 @@ p
|
||||||
python -m spacy link /Users/you/model my_amazing_model
|
python -m spacy link /Users/you/model my_amazing_model
|
||||||
|
|
||||||
+infobox("Important note")
|
+infobox("Important note")
|
||||||
| In order to create a symlink, your user needs the required permissions.
|
| In order to create a symlink, your user needs the #[strong required permissions].
|
||||||
| If you've installed spaCy to a system directory and don't have admin
|
| If you've installed spaCy to a system directory and don't have admin
|
||||||
| privileges, the #[code spacy link] command may fail. The easiest solution
|
| privileges, the #[code spacy link] command may fail. The easiest solution
|
||||||
| is to re-run the command as admin, or use a #[code virtualenv]. For more
|
| is to re-run the command as admin, or use a #[code virtualenv]. For more
|
||||||
|
@ -189,16 +200,26 @@ p
|
||||||
+h(3, "usage-import") Importing models as modules
|
+h(3, "usage-import") Importing models as modules
|
||||||
|
|
||||||
p
|
p
|
||||||
| If you've installed a model via pip, you can also #[code import] it
|
| If you've installed a model via spaCy's downloader, or directly via pip,
|
||||||
| directly and then call its #[code load()] method with no arguments:
|
| you can also #[code import] it and then call its #[code load()] method
|
||||||
|
| with no arguments:
|
||||||
|
|
||||||
+code.
|
+code.
|
||||||
import spacy
|
|
||||||
import en_core_web_md
|
import en_core_web_md
|
||||||
|
|
||||||
nlp = en_core_web_md.load()
|
nlp = en_core_web_md.load()
|
||||||
doc = nlp(u'This is a sentence.')
|
doc = nlp(u'This is a sentence.')
|
||||||
|
|
||||||
|
p
|
||||||
|
| How you choose to load your models ultimately depends on personal
|
||||||
|
| preference. However, #[strong for larger code bases], we usually recommend
|
||||||
|
| native imports, as this will make it easier to integrate models with your
|
||||||
|
| existing build process, continuous integration workflow and testing
|
||||||
|
| framework. It'll also prevent you from ever trying to load a model that
|
||||||
|
| is not installed, as your code will raise an #[code ImportError]
|
||||||
|
| immediately, instead of failing somewhere down the line when calling
|
||||||
|
| #[code spacy.load()].
|
||||||
|
|
||||||
+h(2, "own-models") Using your own models
|
+h(2, "own-models") Using your own models
|
||||||
|
|
||||||
p
|
p
|
||||||
|
|
|
@ -20,13 +20,13 @@ p
|
||||||
|
|
||||||
+list("numbers")
|
+list("numbers")
|
||||||
+item
|
+item
|
||||||
| A token whose #[strong lower-case form matches "hello"], e.g. "Hello"
|
| A token whose #[strong lowercase form matches "hello"], e.g. "Hello"
|
||||||
| or "HELLO".
|
| or "HELLO".
|
||||||
+item
|
+item
|
||||||
| A token whose #[strong #[code is_punct] flag is set to #[code True]],
|
| A token whose #[strong #[code is_punct] flag is set to #[code True]],
|
||||||
| i.e. any punctuation.
|
| i.e. any punctuation.
|
||||||
+item
|
+item
|
||||||
| A token whose #[strong lower-case form matches "world"], e.g. "World"
|
| A token whose #[strong lowercase form matches "world"], e.g. "World"
|
||||||
| or "WORLD".
|
| or "WORLD".
|
||||||
|
|
||||||
+code.
|
+code.
|
||||||
|
@ -95,10 +95,6 @@ p
|
||||||
nlp = spacy.load('en')
|
nlp = spacy.load('en')
|
||||||
matcher = Matcher(nlp.vocab)
|
matcher = Matcher(nlp.vocab)
|
||||||
|
|
||||||
matcher.add('GoogleIO', on_match=add_event_ent,
|
|
||||||
[{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}],
|
|
||||||
[{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}, {'IS_DIGIT': True}])
|
|
||||||
|
|
||||||
# Get the ID of the 'EVENT' entity type. This is required to set an entity.
|
# Get the ID of the 'EVENT' entity type. This is required to set an entity.
|
||||||
EVENT = nlp.vocab.strings['EVENT']
|
EVENT = nlp.vocab.strings['EVENT']
|
||||||
|
|
||||||
|
@ -108,6 +104,10 @@ p
|
||||||
match_id, start, end = matches[i]
|
match_id, start, end = matches[i]
|
||||||
doc.ents += ((EVENT, start, end),)
|
doc.ents += ((EVENT, start, end),)
|
||||||
|
|
||||||
|
matcher.add('GoogleIO', on_match=add_event_ent,
|
||||||
|
[{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}],
|
||||||
|
[{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}, {'IS_DIGIT': True}])
|
||||||
|
|
||||||
p
|
p
|
||||||
| In addition to mentions of "Google I/O", your data also contains some
|
| In addition to mentions of "Google I/O", your data also contains some
|
||||||
| annoying pre-processing artefacts, like leftover HTML line breaks
|
| annoying pre-processing artefacts, like leftover HTML line breaks
|
||||||
|
@ -117,10 +117,6 @@ p
|
||||||
| function #[code merge_and_flag]:
|
| function #[code merge_and_flag]:
|
||||||
|
|
||||||
+code.
|
+code.
|
||||||
matcher.add('BAD_HTML', on_match=merge_and_flag,
|
|
||||||
[{'ORTH': '<'}, {'LOWER': 'br'}, {'ORTH': '>'}],
|
|
||||||
[{'ORTH': '<'}, {'LOWER': 'br/'}, {'ORTH': '>'}])
|
|
||||||
|
|
||||||
# Add a new custom flag to the vocab, which is always False by default.
|
# Add a new custom flag to the vocab, which is always False by default.
|
||||||
# BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span.
|
# BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span.
|
||||||
BAD_HTML_FLAG = doc.vocab.add_flag(lambda text: False)
|
BAD_HTML_FLAG = doc.vocab.add_flag(lambda text: False)
|
||||||
|
@ -131,6 +127,10 @@ p
|
||||||
span.merge(is_stop=True) # merge (and mark it as a stop word, just in case)
|
span.merge(is_stop=True) # merge (and mark it as a stop word, just in case)
|
||||||
span.set_flag(BAD_HTML_FLAG, True) # set BAD_HTML_FLAG
|
span.set_flag(BAD_HTML_FLAG, True) # set BAD_HTML_FLAG
|
||||||
|
|
||||||
|
matcher.add('BAD_HTML', on_match=merge_and_flag,
|
||||||
|
[{'ORTH': '<'}, {'LOWER': 'br'}, {'ORTH': '>'}],
|
||||||
|
[{'ORTH': '<'}, {'LOWER': 'br/'}, {'ORTH': '>'}])
|
||||||
|
|
||||||
+aside("Tip: Visualizing matches")
|
+aside("Tip: Visualizing matches")
|
||||||
| When working with entities, you can use #[+api("displacy") displaCy]
|
| When working with entities, you can use #[+api("displacy") displaCy]
|
||||||
| to quickly generate a NER visualization from your updated #[code Doc],
|
| to quickly generate a NER visualization from your updated #[code Doc],
|
||||||
|
@ -146,18 +146,16 @@ p
|
||||||
|
|
||||||
p
|
p
|
||||||
| We can now call the matcher on our documents. The patterns will be
|
| We can now call the matcher on our documents. The patterns will be
|
||||||
| matched in the order they occur in the text.
|
| matched in the order they occur in the text. The matcher will then
|
||||||
|
| iterate over the matches, look up the callback for the match ID
|
||||||
|
| that was matched, and invoke it.
|
||||||
|
|
||||||
+code.
|
+code.
|
||||||
doc = nlp(LOTS_OF_TEXT)
|
doc = nlp(LOTS_OF_TEXT)
|
||||||
matcher(doc)
|
matcher(doc)
|
||||||
|
|
||||||
+h(3, "on_match-callback") The callback function
|
|
||||||
|
|
||||||
p
|
p
|
||||||
| The matcher will first collect all matches over the document. It will
|
| When the callback is invoked, it is
|
||||||
| then iterate over the matches, lookup the callback for the entity ID
|
|
||||||
| that was matched, and invoke it. When the callback is invoked, it is
|
|
||||||
| passed four arguments: the matcher itself, the document, the position of
|
| passed four arguments: the matcher itself, the document, the position of
|
||||||
| the current match, and the total list of matches. This allows you to
|
| the current match, and the total list of matches. This allows you to
|
||||||
| write callbacks that consider the entire set of matched phrases, so that
|
| write callbacks that consider the entire set of matched phrases, so that
|
||||||
|
@ -185,11 +183,24 @@ p
|
||||||
+cell
|
+cell
|
||||||
| A list of #[code (match_id, start, end)] tuples, describing the
|
| A list of #[code (match_id, start, end)] tuples, describing the
|
||||||
| matches. A match tuple describes a span #[code doc[start:end]].
|
| matches. A match tuple describes a span #[code doc[start:end]].
|
||||||
| The #[code match_id] is the ID of the added match pattern.
|
|
||||||
|
|
||||||
+h(2, "quantifiers") Using quantifiers
|
+h(2, "quantifiers") Using operators and quantifiers
|
||||||
|
|
||||||
+table([ "Name", "Description", "Example"])
|
p
|
||||||
|
| The matcher also lets you use quantifiers, specified as the #[code 'OP']
|
||||||
|
| key. Quantifiers let you define sequences of tokens to be matched, e.g.
|
||||||
|
| one or more punctuation marks, or specify optional tokens. Note that there
|
||||||
|
| are no nested or scoped quantifiers – instead, you can build those
|
||||||
|
| behaviours with #[code on_match] callbacks.
|
||||||
|
|
||||||
|
+aside("Problems with quantifiers")
|
||||||
|
| Using quantifiers may lead to unexpected results when matching
|
||||||
|
| variable-length patterns, for example if the next token would also be
|
||||||
|
| matched by the previous token. This problem should be resolved in a future
|
||||||
|
| release. For more information, see
|
||||||
|
| #[+a(gh("spaCy") + "/issues/864") this issue].
|
||||||
|
|
||||||
|
+table([ "OP", "Description", "Example"])
|
||||||
+row
|
+row
|
||||||
+cell #[code !]
|
+cell #[code !]
|
||||||
+cell match exactly 0 times
|
+cell match exactly 0 times
|
||||||
|
@ -210,6 +221,103 @@ p
|
||||||
+cell match 0 or 1 times
|
+cell match 0 or 1 times
|
||||||
+cell optional, max one
|
+cell optional, max one
|
||||||
|
|
||||||
|
+h(3, "quantifiers-example1") Quantifiers example: Using linguistic annotations
|
||||||
|
|
||||||
p
|
p
|
||||||
| There are no nested or scoped quantifiers. You can build those
|
| Let's say you're analysing user comments and you want to find out what
|
||||||
| behaviours with #[code on_match] callbacks.
|
| people are saying about Facebook. You want to start off by finding
|
||||||
|
| adjectives following "Facebook is" or "Facebook was". This is obviously
|
||||||
|
| a very rudimentary solution, but it'll be fast, and a great way to get an
|
||||||
|
| idea for what's in your data. Your pattern could look like this:
|
||||||
|
|
||||||
|
+code.
|
||||||
|
[{'LOWER': 'facebook'}, {'LEMMA': 'be'}, {'POS': 'ADV', 'OP': '*'}, {'POS': 'ADJ'}]
|
||||||
|
|
||||||
|
p
|
||||||
|
| This translates to a token whose lowercase form matches "facebook"
|
||||||
|
| (like Facebook, facebook or FACEBOOK), followed by a token with the lemma
|
||||||
|
| "be" (for example, is, was, or 's), followed by an #[strong optional] adverb,
|
||||||
|
| followed by an adjective. Using the linguistic annotations here is
|
||||||
|
| especially useful, because you can tell spaCy to match "Facebook's
|
||||||
|
| annoying", but #[strong not] "Facebook's annoying ads". The optional
|
||||||
|
| adverb makes sure you won't miss adjectives with intensifiers, like
|
||||||
|
| "pretty awful" or "very nice".
|
||||||
|
|
||||||
|
p
|
||||||
|
| To get a quick overview of the results, you could collect all sentences
|
||||||
|
| containing a match and render them with the
|
||||||
|
| #[+a("/docs/usage/visualizers") displaCy visualizer].
|
||||||
|
| In the callback function, you'll have access to the #[code start] and
|
||||||
|
| #[code end] of each match, as well as the parent #[code Doc]. This lets
|
||||||
|
| you determine the sentence containing the match,
|
||||||
|
| #[code doc[start : end].sent], and calculate the start and end of the
|
||||||
|
| matched span within the sentence. Using displaCy in
|
||||||
|
| #[+a("/docs/usage/visualizers#manual-usage") "manual" mode] lets you
|
||||||
|
| pass in a list of dictionaries containing the text and entities to render.
|
||||||
|
|
||||||
|
+code.
|
||||||
|
from spacy import displacy
|
||||||
|
from spacy.matcher import Matcher
|
||||||
|
|
||||||
|
nlp = spacy.load('en')
|
||||||
|
matcher = Matcher(nlp.vocab)
|
||||||
|
matched_sents = [] # collect data of matched sentences to be visualized
|
||||||
|
|
||||||
|
def collect_sents(matcher, doc, i, matches):
|
||||||
|
match_id, start, end = matches[i]
|
||||||
|
span = doc[start : end] # matched span
|
||||||
|
sent = span.sent # sentence containing matched span
|
||||||
|
# append mock entity for match in displaCy style to matched_sents
|
||||||
|
# get the match span by offsetting the start and end of the span with the
|
||||||
|
# start and end of the sentence in the doc
|
||||||
|
match_ents = [{'start': span.start-sent.start, 'end': span.end-sent.start,
|
||||||
|
'label': 'MATCH'}]
|
||||||
|
matched_sents.append({'text': sent.text, 'ents': match_ents })
|
||||||
|
|
||||||
|
pattern = [{'LOWER': 'facebook'}, {'LEMMA': 'be'}, {'POS': 'ADV', 'OP': '*'},
|
||||||
|
{'POS': 'ADJ'}]
|
||||||
|
matcher.add('FacebookIs', collect_sents, pattern) # add pattern
|
||||||
|
matches = matcher(nlp(LOTS_OF_TEXT)) # match on your text
|
||||||
|
|
||||||
|
# serve visualization of sentences containing match with displaCy
|
||||||
|
# set manual=True to make displaCy render straight from a dictionary
|
||||||
|
displacy.serve(matched_sents, style='ent', manual=True)
|
||||||
|
|
||||||
|
|
||||||
|
+h(3, "quantifiers-example2") Quantifiers example: Phone numbers
|
||||||
|
|
||||||
|
p
|
||||||
|
| Phone numbers can have many different formats and matching them is often
|
||||||
|
| tricky. During tokenization, spaCy will leave sequences of numbers intact
|
||||||
|
| and only split on whitespace and punctuation. This means that your match
|
||||||
|
| pattern will have to look out for number sequences of a certain length,
|
||||||
|
| surrounded by specific punctuation – depending on the
|
||||||
|
| #[+a("https://en.wikipedia.org/wiki/National_conventions_for_writing_telephone_numbers") national conventions].
|
||||||
|
|
||||||
|
p
|
||||||
|
| The #[code IS_DIGIT] flag is not very helpful here, because it doesn't
|
||||||
|
| tell us anything about the length. However, you can use the #[code SHAPE]
|
||||||
|
| flag, with each #[code d] representing a digit:
|
||||||
|
|
||||||
|
+code.
|
||||||
|
[{'ORTH': '('}, {'SHAPE': 'ddd'}, {'ORTH': ')'}, {'SHAPE': 'dddd'},
|
||||||
|
{'ORTH': '-', 'OP': '?'}, {'SHAPE': 'dddd'}]
|
||||||
|
|
||||||
|
p
|
||||||
|
| This will match phone numbers of the format #[strong (123) 4567 8901] or
|
||||||
|
| #[strong (123) 4567-8901]. To also match formats like #[strong (123) 456 789],
|
||||||
|
| you can add a second pattern using #[code 'ddd'] in place of #[code 'dddd'].
|
||||||
|
| By hard-coding some values, you can match only certain, country-specific
|
||||||
|
| numbers. For example, here's a pattern to match the most common formats of
|
||||||
|
| #[+a("https://en.wikipedia.org/wiki/National_conventions_for_writing_telephone_numbers#Germany") international German numbers]:
|
||||||
|
|
||||||
|
+code.
|
||||||
|
[{'ORTH': '+'}, {'ORTH': '49'}, {'ORTH': '(', 'OP': '?'}, {'SHAPE': 'dddd'},
|
||||||
|
{'ORTH': ')', 'OP': '?'}, {'SHAPE': 'dddddd'}]
|
||||||
|
|
||||||
|
p
|
||||||
|
| Depending on the formats your application needs to match, creating an
|
||||||
|
| extensive set of rules like this is often better than training a model.
|
||||||
|
| It'll produce more predictable results, is much easier to modify and
|
||||||
|
| extend, and doesn't require any training data – only a set of
|
||||||
|
| test cases.
|
||||||
|
|
|
@ -287,24 +287,17 @@ p
|
||||||
| #[+a("http://www.nltk.org") NLTK] or
|
| #[+a("http://www.nltk.org") NLTK] or
|
||||||
| #[+a("https://github.com/tensorflow/models/tree/master/syntaxnet") SyntaxNet].
|
| #[+a("https://github.com/tensorflow/models/tree/master/syntaxnet") SyntaxNet].
|
||||||
| Simply convert the dependency parse or recognised entities to displaCy's
|
| Simply convert the dependency parse or recognised entities to displaCy's
|
||||||
| format and import #[code DependencyRenderer] or #[code EntityRenderer]
|
| format and set #[code manual=True] on either #[code render()] or
|
||||||
| from #[code spacy.displacy.render]. A renderer class can be is initialised
|
| #[code serve()].
|
||||||
| with a dictionary of options. To generate the visualization markup, call
|
|
||||||
| the renderer's #[code render()] method on a list of dictionaries (one
|
|
||||||
| per visualization).
|
|
||||||
|
|
||||||
|
|
||||||
+aside-code("Example").
|
+aside-code("Example").
|
||||||
from spacy.displacy.render import EntityRenderer
|
|
||||||
|
|
||||||
ex = [{'text': 'But Google is starting from behind.',
|
ex = [{'text': 'But Google is starting from behind.',
|
||||||
'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}],
|
'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}],
|
||||||
'title': None}]
|
'title': None}]
|
||||||
renderer = EntityRenderer()
|
html = displacy.render(ex, style='ent', manual=True)
|
||||||
html = renderer.render(ex)
|
|
||||||
|
|
||||||
+code("DependencyRenderer input").
|
+code("DEP input").
|
||||||
[{
|
{
|
||||||
'words': [
|
'words': [
|
||||||
{'text': 'This', 'tag': 'DT'},
|
{'text': 'This', 'tag': 'DT'},
|
||||||
{'text': 'is', 'tag': 'VBZ'},
|
{'text': 'is', 'tag': 'VBZ'},
|
||||||
|
@ -314,11 +307,10 @@ p
|
||||||
{'start': 0, 'end': 1, 'label': 'nsubj', 'dir': 'left'},
|
{'start': 0, 'end': 1, 'label': 'nsubj', 'dir': 'left'},
|
||||||
{'start': 2, 'end': 3, 'label': 'det', 'dir': 'left'},
|
{'start': 2, 'end': 3, 'label': 'det', 'dir': 'left'},
|
||||||
{'start': 1, 'end': 3, 'label': 'attr', 'dir': 'right'}]
|
{'start': 1, 'end': 3, 'label': 'attr', 'dir': 'right'}]
|
||||||
}]
|
}
|
||||||
|
|
||||||
+code("EntityRenderer input").
|
+code("ENT input").
|
||||||
[{
|
{
|
||||||
'text': 'But Google is starting from behind.',
|
'text': 'But Google is starting from behind.',
|
||||||
'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}],
|
'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}],
|
||||||
'title': None
|
'title': None
|
||||||
}]
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user