mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
		
						commit
						f0bcc0bd8d
					
				|  | @ -10,27 +10,28 @@ _html = {} | ||||||
| IS_JUPYTER = is_in_jupyter() | IS_JUPYTER = is_in_jupyter() | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER, options={}): | def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER, | ||||||
|  |           options={}, manual=False): | ||||||
|     """Render displaCy visualisation. |     """Render displaCy visualisation. | ||||||
| 
 | 
 | ||||||
|     docs (list or Doc): Document(s) to visualise. |     docs (list or Doc): Document(s) to visualise. | ||||||
|     style (unicode): Visualisation style, 'dep' or 'ent'. |     style (unicode): Visualisation style, 'dep' or 'ent'. | ||||||
|     page (bool): Render markup as full HTML page. |     page (bool): Render markup as full HTML page. | ||||||
|     minify (bool): Minify HTML markup. |     minify (bool): Minify HTML markup. | ||||||
|     jupyter (bool): Experimental, use Jupyter's display() to output markup. |     jupyter (bool): Experimental, use Jupyter's `display()` to output markup. | ||||||
|     options (dict): Visualiser-specific options, e.g. colors. |     options (dict): Visualiser-specific options, e.g. colors. | ||||||
|  |     manual (bool): Don't parse `Doc` and instead, expect a dict or list of dicts. | ||||||
|     RETURNS (unicode): Rendered HTML markup. |     RETURNS (unicode): Rendered HTML markup. | ||||||
|     """ |     """ | ||||||
|     if isinstance(docs, Doc): |     factories = {'dep': (DependencyRenderer, parse_deps), | ||||||
|         docs = [docs] |                  'ent': (EntityRenderer, parse_ents)} | ||||||
|     if style == 'dep': |     if style not in factories: | ||||||
|         renderer = DependencyRenderer(options=options) |  | ||||||
|         parsed = [parse_deps(doc, options) for doc in docs] |  | ||||||
|     elif style == 'ent': |  | ||||||
|         renderer = EntityRenderer(options=options) |  | ||||||
|         parsed = [parse_ents(doc, options) for doc in docs] |  | ||||||
|     else: |  | ||||||
|         raise ValueError("Unknown style: %s" % style) |         raise ValueError("Unknown style: %s" % style) | ||||||
|  |     if isinstance(docs, Doc) or isinstance(docs, dict): | ||||||
|  |         docs = [docs] | ||||||
|  |     renderer, converter = factories[style] | ||||||
|  |     renderer = renderer(options=options) | ||||||
|  |     parsed = [converter(doc, options) for doc in docs] if not manual else docs | ||||||
|     _html['parsed'] = renderer.render(parsed, page=page, minify=minify).strip() |     _html['parsed'] = renderer.render(parsed, page=page, minify=minify).strip() | ||||||
|     html = _html['parsed'] |     html = _html['parsed'] | ||||||
|     if jupyter: # return HTML rendered by IPython display() |     if jupyter: # return HTML rendered by IPython display() | ||||||
|  | @ -39,7 +40,8 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER, opti | ||||||
|     return html |     return html | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def serve(docs, style='dep', page=True, minify=False, options={}, port=5000): | def serve(docs, style='dep', page=True, minify=False, options={}, manual=False, | ||||||
|  |           port=5000): | ||||||
|     """Serve displaCy visualisation. |     """Serve displaCy visualisation. | ||||||
| 
 | 
 | ||||||
|     docs (list or Doc): Document(s) to visualise. |     docs (list or Doc): Document(s) to visualise. | ||||||
|  | @ -47,10 +49,11 @@ def serve(docs, style='dep', page=True, minify=False, options={}, port=5000): | ||||||
|     page (bool): Render markup as full HTML page. |     page (bool): Render markup as full HTML page. | ||||||
|     minify (bool): Minify HTML markup. |     minify (bool): Minify HTML markup. | ||||||
|     options (dict): Visualiser-specific options, e.g. colors. |     options (dict): Visualiser-specific options, e.g. colors. | ||||||
|  |     manual (bool): Don't parse `Doc` and instead, expect a dict or list of dicts. | ||||||
|     port (int): Port to serve visualisation. |     port (int): Port to serve visualisation. | ||||||
|     """ |     """ | ||||||
|     from wsgiref import simple_server |     from wsgiref import simple_server | ||||||
|     render(docs, style=style, page=page, minify=minify, options=options) |     render(docs, style=style, page=page, minify=minify, options=options, manual=manual) | ||||||
|     httpd = simple_server.make_server('0.0.0.0', port, app) |     httpd = simple_server.make_server('0.0.0.0', port, app) | ||||||
|     prints("Using the '%s' visualizer" % style, title="Serving on port %d..." % port) |     prints("Using the '%s' visualizer" % style, title="Serving on port %d..." % port) | ||||||
|     httpd.serve_forever() |     httpd.serve_forever() | ||||||
|  |  | ||||||
|  | @ -175,7 +175,7 @@ class EntityRenderer(object): | ||||||
|         minify (bool): Minify HTML markup. |         minify (bool): Minify HTML markup. | ||||||
|         RETURNS (unicode): Rendered HTML markup. |         RETURNS (unicode): Rendered HTML markup. | ||||||
|         """ |         """ | ||||||
|         rendered = [self.render_ents(p['text'], p['ents'], p['title']) for p in parsed] |         rendered = [self.render_ents(p['text'], p['ents'], p.get('title', None)) for p in parsed] | ||||||
|         if page: |         if page: | ||||||
|             docs = ''.join([TPL_FIGURE.format(content=doc) for doc in rendered]) |             docs = ''.join([TPL_FIGURE.format(content=doc) for doc in rendered]) | ||||||
|             markup = TPL_PAGE.format(content=docs) |             markup = TPL_PAGE.format(content=docs) | ||||||
|  |  | ||||||
|  | @ -54,6 +54,15 @@ p | ||||||
|         +cell #[+a("#options") Visualizer-specific options], e.g. colors. |         +cell #[+a("#options") Visualizer-specific options], e.g. colors. | ||||||
|         +cell #[code {}] |         +cell #[code {}] | ||||||
| 
 | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code manual] | ||||||
|  |         +cell bool | ||||||
|  |         +cell | ||||||
|  |             |  Don't parse #[code Doc] and instead, expect a dict or list of | ||||||
|  |             |  dicts. #[+a("/docs/usage/visualizers#manual-usage") See here] | ||||||
|  |             |  for formats and examples. | ||||||
|  |         +cell #[code False] | ||||||
|  | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code port] |         +cell #[code port] | ||||||
|         +cell int |         +cell int | ||||||
|  | @ -111,6 +120,15 @@ p Render a dependency parse tree or named entity visualization. | ||||||
|         +cell #[+a("#options") Visualizer-specific options], e.g. colors. |         +cell #[+a("#options") Visualizer-specific options], e.g. colors. | ||||||
|         +cell #[code {}] |         +cell #[code {}] | ||||||
| 
 | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code manual] | ||||||
|  |         +cell bool | ||||||
|  |         +cell | ||||||
|  |             |  Don't parse #[code Doc] and instead, expect a dict or list of | ||||||
|  |             |  dicts. #[+a("/docs/usage/visualizers#manual-usage") See here] | ||||||
|  |             |  for formats and examples. | ||||||
|  |         +cell #[code False] | ||||||
|  | 
 | ||||||
|     +footrow |     +footrow | ||||||
|         +cell returns |         +cell returns | ||||||
|         +cell unicode |         +cell unicode | ||||||
|  |  | ||||||
|  | @ -101,6 +101,9 @@ p | ||||||
|     |  directory. You can then use #[code spacy.load()] to load it via its |     |  directory. You can then use #[code spacy.load()] to load it via its | ||||||
|     |  package name, create a #[+a("#usage-link") shortcut link] to assign it a |     |  package name, create a #[+a("#usage-link") shortcut link] to assign it a | ||||||
|     |  custom name, or #[+a("#usage-import") import it] explicitly as a module. |     |  custom name, or #[+a("#usage-import") import it] explicitly as a module. | ||||||
|  |     |  If you need to download models as part of an automated process, we | ||||||
|  |     |  recommend using pip with a direct link, instead of relying on spaCy's | ||||||
|  |     |  #[+api("cli#download") #[code download]] command. | ||||||
| 
 | 
 | ||||||
| +h(3, "download-manual") Manual download and installation | +h(3, "download-manual") Manual download and installation | ||||||
| 
 | 
 | ||||||
|  | @ -162,6 +165,14 @@ p | ||||||
|     |  The #[+api("cli#link") #[code link]] command will create a symlink |     |  The #[+api("cli#link") #[code link]] command will create a symlink | ||||||
|     |  in the #[code spacy/data] directory. |     |  in the #[code spacy/data] directory. | ||||||
| 
 | 
 | ||||||
|  | +aside("Why does spaCy use symlinks?") | ||||||
|  |     |  Symlinks were originally introduced to maintain backwards compatibility, | ||||||
|  |     |  as older versions expected model data to live within #[code spacy/data]. | ||||||
|  |     |  However, we decided to keep using them in v2.0 instead of opting for | ||||||
|  |     |  a config file. There'll always be a need for assigning and saving custom | ||||||
|  |     |  model names or IDs. And your system already comes with a native solution | ||||||
|  |     |  to mapping unicode aliases to file paths: symbolic links. | ||||||
|  | 
 | ||||||
| +code(false, "bash"). | +code(false, "bash"). | ||||||
|     python -m spacy link [package name or path] [shortcut] [--force] |     python -m spacy link [package name or path] [shortcut] [--force] | ||||||
| 
 | 
 | ||||||
|  | @ -179,7 +190,7 @@ p | ||||||
|     python -m spacy link /Users/you/model my_amazing_model |     python -m spacy link /Users/you/model my_amazing_model | ||||||
| 
 | 
 | ||||||
| +infobox("Important note") | +infobox("Important note") | ||||||
|     |  In order to create a symlink, your user needs the required permissions. |     |  In order to create a symlink, your user needs the #[strong required permissions]. | ||||||
|     |  If you've installed spaCy to a system directory and don't have admin |     |  If you've installed spaCy to a system directory and don't have admin | ||||||
|     |  privileges, the #[code spacy link] command may fail. The easiest solution |     |  privileges, the #[code spacy link] command may fail. The easiest solution | ||||||
|     |  is to re-run the command as admin, or use a #[code virtualenv]. For more |     |  is to re-run the command as admin, or use a #[code virtualenv]. For more | ||||||
|  | @ -189,16 +200,26 @@ p | ||||||
| +h(3, "usage-import") Importing models as modules | +h(3, "usage-import") Importing models as modules | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  If you've installed a model via pip, you can also #[code import] it |     |  If you've installed a model via spaCy's downloader, or directly via pip, | ||||||
|     |  directly and then call its #[code load()] method with no arguments: |     |  you can also #[code import] it and then call its #[code load()] method | ||||||
|  |     |  with no arguments: | ||||||
| 
 | 
 | ||||||
| +code. | +code. | ||||||
|     import spacy |  | ||||||
|     import en_core_web_md |     import en_core_web_md | ||||||
| 
 | 
 | ||||||
|     nlp = en_core_web_md.load() |     nlp = en_core_web_md.load() | ||||||
|     doc = nlp(u'This is a sentence.') |     doc = nlp(u'This is a sentence.') | ||||||
| 
 | 
 | ||||||
|  | p | ||||||
|  |     |  How you choose to load your models ultimately depends on personal | ||||||
|  |     |  preference. However, #[strong for larger code bases], we usually recommend | ||||||
|  |     |  native imports, as this will make it easier to integrate models with your | ||||||
|  |     |  existing build process, continuous integration workflow and testing | ||||||
|  |     |  framework. It'll also prevent you from ever trying to load a model that | ||||||
|  |     |  is not installed, as your code will raise an #[code ImportError] | ||||||
|  |     |  immediately, instead of failing somewhere down the line when calling | ||||||
|  |     |  #[code spacy.load()]. | ||||||
|  | 
 | ||||||
| +h(2, "own-models") Using your own models | +h(2, "own-models") Using your own models | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|  |  | ||||||
|  | @ -20,13 +20,13 @@ p | ||||||
| 
 | 
 | ||||||
| +list("numbers") | +list("numbers") | ||||||
|     +item |     +item | ||||||
|         |  A token whose #[strong lower-case form matches "hello"], e.g. "Hello" |         |  A token whose #[strong lowercase form matches "hello"], e.g. "Hello" | ||||||
|         |  or "HELLO". |         |  or "HELLO". | ||||||
|     +item |     +item | ||||||
|         |  A token whose #[strong #[code is_punct] flag is set to #[code True]], |         |  A token whose #[strong #[code is_punct] flag is set to #[code True]], | ||||||
|         |  i.e. any punctuation. |         |  i.e. any punctuation. | ||||||
|     +item |     +item | ||||||
|         |  A token whose #[strong lower-case form matches "world"], e.g. "World" |         |  A token whose #[strong lowercase form matches "world"], e.g. "World" | ||||||
|         |  or "WORLD". |         |  or "WORLD". | ||||||
| 
 | 
 | ||||||
| +code. | +code. | ||||||
|  | @ -95,10 +95,6 @@ p | ||||||
|     nlp = spacy.load('en') |     nlp = spacy.load('en') | ||||||
|     matcher = Matcher(nlp.vocab) |     matcher = Matcher(nlp.vocab) | ||||||
| 
 | 
 | ||||||
|     matcher.add('GoogleIO', on_match=add_event_ent, |  | ||||||
|                 [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}], |  | ||||||
|                 [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}, {'IS_DIGIT': True}]) |  | ||||||
| 
 |  | ||||||
|     # Get the ID of the 'EVENT' entity type. This is required to set an entity. |     # Get the ID of the 'EVENT' entity type. This is required to set an entity. | ||||||
|     EVENT = nlp.vocab.strings['EVENT'] |     EVENT = nlp.vocab.strings['EVENT'] | ||||||
| 
 | 
 | ||||||
|  | @ -108,6 +104,10 @@ p | ||||||
|         match_id, start, end = matches[i] |         match_id, start, end = matches[i] | ||||||
|         doc.ents += ((EVENT, start, end),) |         doc.ents += ((EVENT, start, end),) | ||||||
| 
 | 
 | ||||||
|  |     matcher.add('GoogleIO', on_match=add_event_ent, | ||||||
|  |                 [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}], | ||||||
|  |                 [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}, {'IS_DIGIT': True}]) | ||||||
|  | 
 | ||||||
| p | p | ||||||
|     |  In addition to mentions of "Google I/O", your data also contains some |     |  In addition to mentions of "Google I/O", your data also contains some | ||||||
|     |  annoying pre-processing artefacts, like leftover HTML line breaks |     |  annoying pre-processing artefacts, like leftover HTML line breaks | ||||||
|  | @ -117,10 +117,6 @@ p | ||||||
|     |  function #[code merge_and_flag]: |     |  function #[code merge_and_flag]: | ||||||
| 
 | 
 | ||||||
| +code. | +code. | ||||||
|     matcher.add('BAD_HTML', on_match=merge_and_flag, |  | ||||||
|                 [{'ORTH': '<'}, {'LOWER': 'br'}, {'ORTH': '>'}], |  | ||||||
|                 [{'ORTH': '<'}, {'LOWER': 'br/'}, {'ORTH': '>'}]) |  | ||||||
| 
 |  | ||||||
|     # Add a new custom flag to the vocab, which is always False by default. |     # Add a new custom flag to the vocab, which is always False by default. | ||||||
|     # BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span. |     # BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span. | ||||||
|     BAD_HTML_FLAG = doc.vocab.add_flag(lambda text: False) |     BAD_HTML_FLAG = doc.vocab.add_flag(lambda text: False) | ||||||
|  | @ -131,6 +127,10 @@ p | ||||||
|         span.merge(is_stop=True) # merge (and mark it as a stop word, just in case) |         span.merge(is_stop=True) # merge (and mark it as a stop word, just in case) | ||||||
|         span.set_flag(BAD_HTML_FLAG, True) # set BAD_HTML_FLAG |         span.set_flag(BAD_HTML_FLAG, True) # set BAD_HTML_FLAG | ||||||
| 
 | 
 | ||||||
|  |     matcher.add('BAD_HTML', on_match=merge_and_flag, | ||||||
|  |                 [{'ORTH': '<'}, {'LOWER': 'br'}, {'ORTH': '>'}], | ||||||
|  |                 [{'ORTH': '<'}, {'LOWER': 'br/'}, {'ORTH': '>'}]) | ||||||
|  | 
 | ||||||
| +aside("Tip: Visualizing matches") | +aside("Tip: Visualizing matches") | ||||||
|     |  When working with entities, you can use #[+api("displacy") displaCy] |     |  When working with entities, you can use #[+api("displacy") displaCy] | ||||||
|     |  to quickly generate a NER visualization from your updated #[code Doc], |     |  to quickly generate a NER visualization from your updated #[code Doc], | ||||||
|  | @ -146,18 +146,16 @@ p | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  We can now call the matcher on our documents. The patterns will be |     |  We can now call the matcher on our documents. The patterns will be | ||||||
|     |  matched in the order they occur in the text. |     |  matched in the order they occur in the text. The matcher will then | ||||||
|  |     |  iterate over the matches, look up the callback for the match ID | ||||||
|  |     |  that was matched, and invoke it. | ||||||
| 
 | 
 | ||||||
| +code. | +code. | ||||||
|     doc = nlp(LOTS_OF_TEXT) |     doc = nlp(LOTS_OF_TEXT) | ||||||
|     matcher(doc) |     matcher(doc) | ||||||
| 
 | 
 | ||||||
| +h(3, "on_match-callback") The callback function |  | ||||||
| 
 |  | ||||||
| p | p | ||||||
|     |  The matcher will first collect all matches over the document. It will |     |  When the callback is invoked, it is | ||||||
|     |  then iterate over the matches, lookup the callback for the entity ID |  | ||||||
|     |  that was matched, and invoke it. When the callback is invoked, it is |  | ||||||
|     |  passed four arguments: the matcher itself, the document, the position of |     |  passed four arguments: the matcher itself, the document, the position of | ||||||
|     |  the current match, and the total list of matches. This allows you to |     |  the current match, and the total list of matches. This allows you to | ||||||
|     |  write callbacks that consider the entire set of matched phrases, so that |     |  write callbacks that consider the entire set of matched phrases, so that | ||||||
|  | @ -185,11 +183,24 @@ p | ||||||
|         +cell |         +cell | ||||||
|             |  A list of #[code (match_id, start, end)] tuples, describing the |             |  A list of #[code (match_id, start, end)] tuples, describing the | ||||||
|             |  matches. A match tuple describes a span #[code doc[start:end]]. |             |  matches. A match tuple describes a span #[code doc[start:end]]. | ||||||
|             |  The #[code match_id] is the ID of the added match pattern. |  | ||||||
| 
 | 
 | ||||||
| +h(2, "quantifiers") Using quantifiers | +h(2, "quantifiers") Using operators and quantifiers | ||||||
| 
 | 
 | ||||||
| +table([ "Name", "Description", "Example"]) | p | ||||||
|  |     |  The matcher also lets you use quantifiers, specified as the #[code 'OP'] | ||||||
|  |     |  key. Quantifiers let you define sequences of tokens to be matched, e.g. | ||||||
|  |     |  one or more punctuation marks, or specify optional tokens. Note that there | ||||||
|  |     |  are no nested or scoped quantifiers – instead, you can build those | ||||||
|  |     |  behaviours with #[code on_match] callbacks. | ||||||
|  | 
 | ||||||
|  | +aside("Problems with quantifiers") | ||||||
|  |     |  Using quantifiers may lead to unexpected results when matching | ||||||
|  |     |  variable-length patterns, for example if the next token would also be | ||||||
|  |     |  matched by the previous token. This problem should be resolved in a future | ||||||
|  |     |  release. For more information, see | ||||||
|  |     |  #[+a(gh("spaCy") + "/issues/864") this issue]. | ||||||
|  | 
 | ||||||
|  | +table([ "OP", "Description", "Example"]) | ||||||
|     +row |     +row | ||||||
|         +cell #[code !] |         +cell #[code !] | ||||||
|         +cell match exactly 0 times |         +cell match exactly 0 times | ||||||
|  | @ -210,6 +221,103 @@ p | ||||||
|         +cell match 0 or 1 times |         +cell match 0 or 1 times | ||||||
|         +cell optional, max one |         +cell optional, max one | ||||||
| 
 | 
 | ||||||
|  | +h(3, "quantifiers-example1") Quantifiers example: Using linguistic annotations | ||||||
|  | 
 | ||||||
| p | p | ||||||
|     |  There are no nested or scoped quantifiers. You can build those |     |  Let's say you're analysing user comments and you want to find out what | ||||||
|     |  behaviours with #[code on_match] callbacks. |     |  people are saying about Facebook. You want to start off by finding | ||||||
|  |     |  adjectives following "Facebook is" or "Facebook was". This is obviously | ||||||
|  |     |  a very rudimentary solution, but it'll be fast, and a great way to get an | ||||||
|  |     |  idea for what's in your data. Your pattern could look like this: | ||||||
|  | 
 | ||||||
|  | +code. | ||||||
|  |     [{'LOWER': 'facebook'}, {'LEMMA': 'be'}, {'POS': 'ADV', 'OP': '*'}, {'POS': 'ADJ'}] | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  This translates to a token whose lowercase form matches "facebook" | ||||||
|  |     |  (like Facebook, facebook or FACEBOOK), followed by a token with the lemma | ||||||
|  |     |  "be" (for example, is, was, or 's), followed by an #[strong optional] adverb, | ||||||
|  |     |  followed by an adjective. Using the linguistic annotations here is | ||||||
|  |     |  especially useful, because you can tell spaCy to match "Facebook's | ||||||
|  |     |  annoying", but #[strong not] "Facebook's annoying ads". The optional | ||||||
|  |     |  adverb makes sure you won't miss adjectives with intensifiers, like | ||||||
|  |     |  "pretty awful" or "very nice". | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  To get a quick overview of the results, you could collect all sentences | ||||||
|  |     |  containing a match and render them with the | ||||||
|  |     |  #[+a("/docs/usage/visualizers") displaCy visualizer]. | ||||||
|  |     |  In the callback function, you'll have access to the #[code start] and | ||||||
|  |     |  #[code end] of each match, as well as the parent #[code Doc]. This lets | ||||||
|  |     |  you determine the sentence containing the match, | ||||||
|  |     |  #[code doc[start : end].sent], and calculate the start and end of the | ||||||
|  |     |  matched span within the sentence. Using displaCy in | ||||||
|  |     |  #[+a("/docs/usage/visualizers#manual-usage") "manual" mode] lets you | ||||||
|  |     |  pass in a list of dictionaries containing the text and entities to render. | ||||||
|  | 
 | ||||||
|  | +code. | ||||||
|  |     from spacy import displacy | ||||||
|  |     from spacy.matcher import Matcher | ||||||
|  | 
 | ||||||
|  |     nlp = spacy.load('en') | ||||||
|  |     matcher = Matcher(nlp.vocab) | ||||||
|  |     matched_sents = [] # collect data of matched sentences to be visualized | ||||||
|  | 
 | ||||||
|  |     def collect_sents(matcher, doc, i, matches): | ||||||
|  |         match_id, start, end = matches[i] | ||||||
|  |         span = doc[start : end] # matched span | ||||||
|  |         sent = span.sent # sentence containing matched span | ||||||
|  |         # append mock entity for match in displaCy style to matched_sents | ||||||
|  |         # get the match span by offsetting the start and end of the span with the | ||||||
|  |         # start and end of the sentence in the doc | ||||||
|  |         match_ents = [{'start': span.start-sent.start, 'end': span.end-sent.start, | ||||||
|  |                        'label': 'MATCH'}] | ||||||
|  |         matched_sents.append({'text': sent.text, 'ents': match_ents }) | ||||||
|  | 
 | ||||||
|  |     pattern = [{'LOWER': 'facebook'}, {'LEMMA': 'be'}, {'POS': 'ADV', 'OP': '*'}, | ||||||
|  |                {'POS': 'ADJ'}] | ||||||
|  |     matcher.add('FacebookIs', collect_sents, pattern) # add pattern | ||||||
|  |     matches = matcher(nlp(LOTS_OF_TEXT)) # match on your text | ||||||
|  | 
 | ||||||
|  |     # serve visualization of sentences containing match with displaCy | ||||||
|  |     # set manual=True to make displaCy render straight from a dictionary | ||||||
|  |     displacy.serve(matched_sents, style='ent', manual=True) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | +h(3, "quantifiers-example2") Quantifiers example: Phone numbers | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Phone numbers can have many different formats and matching them is often | ||||||
|  |     |  tricky. During tokenization, spaCy will leave sequences of numbers intact | ||||||
|  |     |  and only split on whitespace and punctuation. This means that your match | ||||||
|  |     |  pattern will have to look out for number sequences of a certain length, | ||||||
|  |     |  surrounded by specific punctuation – depending on the | ||||||
|  |     |  #[+a("https://en.wikipedia.org/wiki/National_conventions_for_writing_telephone_numbers") national conventions]. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  The #[code IS_DIGIT] flag is not very helpful here, because it doesn't | ||||||
|  |     |  tell us anything about the length. However, you can use the #[code SHAPE] | ||||||
|  |     |  flag, with each #[code d] representing a digit: | ||||||
|  | 
 | ||||||
|  | +code. | ||||||
|  |     [{'ORTH': '('}, {'SHAPE': 'ddd'}, {'ORTH': ')'}, {'SHAPE': 'dddd'}, | ||||||
|  |      {'ORTH': '-', 'OP': '?'}, {'SHAPE': 'dddd'}] | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  This will match phone numbers of the format #[strong (123) 4567 8901] or | ||||||
|  |     |  #[strong (123) 4567-8901]. To also match formats like #[strong (123) 456 789], | ||||||
|  |     |  you can add a second pattern using #[code 'ddd'] in place of #[code 'dddd']. | ||||||
|  |     |  By hard-coding some values, you can match only certain, country-specific | ||||||
|  |     |  numbers. For example, here's a pattern to match the most common formats of | ||||||
|  |     |  #[+a("https://en.wikipedia.org/wiki/National_conventions_for_writing_telephone_numbers#Germany") international German numbers]: | ||||||
|  | 
 | ||||||
|  | +code. | ||||||
|  |     [{'ORTH': '+'}, {'ORTH': '49'}, {'ORTH': '(', 'OP': '?'}, {'SHAPE': 'dddd'}, | ||||||
|  |      {'ORTH': ')', 'OP': '?'}, {'SHAPE': 'dddddd'}] | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Depending on the formats your application needs to match, creating an | ||||||
|  |     |  extensive set of rules like this is often better than training a model. | ||||||
|  |     |  It'll produce more predictable results, is much easier to modify and | ||||||
|  |     |  extend, and doesn't require any training data – only a set of | ||||||
|  |     |  test cases. | ||||||
|  |  | ||||||
|  | @ -287,24 +287,17 @@ p | ||||||
|     |  #[+a("http://www.nltk.org") NLTK] or |     |  #[+a("http://www.nltk.org") NLTK] or | ||||||
|     |  #[+a("https://github.com/tensorflow/models/tree/master/syntaxnet") SyntaxNet]. |     |  #[+a("https://github.com/tensorflow/models/tree/master/syntaxnet") SyntaxNet]. | ||||||
|     |  Simply convert the dependency parse or recognised entities to displaCy's |     |  Simply convert the dependency parse or recognised entities to displaCy's | ||||||
|     |  format and import #[code DependencyRenderer] or #[code EntityRenderer] |     |  format and set #[code manual=True] on either #[code render()] or | ||||||
|     |  from #[code spacy.displacy.render]. A renderer class can be is initialised |     |  #[code serve()]. | ||||||
|     |  with a dictionary of options. To generate the visualization markup, call |  | ||||||
|     |  the renderer's #[code render()] method on a list of dictionaries (one |  | ||||||
|     |  per visualization). |  | ||||||
| 
 |  | ||||||
| 
 | 
 | ||||||
| +aside-code("Example"). | +aside-code("Example"). | ||||||
|     from spacy.displacy.render import EntityRenderer |  | ||||||
| 
 |  | ||||||
|     ex = [{'text': 'But Google is starting from behind.', |     ex = [{'text': 'But Google is starting from behind.', | ||||||
|            'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}], |            'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}], | ||||||
|            'title': None}] |            'title': None}] | ||||||
|     renderer = EntityRenderer() |     html = displacy.render(ex, style='ent', manual=True) | ||||||
|     html = renderer.render(ex) |  | ||||||
| 
 | 
 | ||||||
| +code("DependencyRenderer input"). | +code("DEP input"). | ||||||
|     [{ |     { | ||||||
|         'words': [ |         'words': [ | ||||||
|             {'text': 'This', 'tag': 'DT'}, |             {'text': 'This', 'tag': 'DT'}, | ||||||
|             {'text': 'is', 'tag': 'VBZ'}, |             {'text': 'is', 'tag': 'VBZ'}, | ||||||
|  | @ -314,11 +307,10 @@ p | ||||||
|             {'start': 0, 'end': 1, 'label': 'nsubj', 'dir': 'left'}, |             {'start': 0, 'end': 1, 'label': 'nsubj', 'dir': 'left'}, | ||||||
|             {'start': 2, 'end': 3, 'label': 'det', 'dir': 'left'}, |             {'start': 2, 'end': 3, 'label': 'det', 'dir': 'left'}, | ||||||
|             {'start': 1, 'end': 3, 'label': 'attr', 'dir': 'right'}] |             {'start': 1, 'end': 3, 'label': 'attr', 'dir': 'right'}] | ||||||
|     }] |     } | ||||||
| 
 | 
 | ||||||
| +code("EntityRenderer input"). | +code("ENT input"). | ||||||
|     [{ |     { | ||||||
|         'text': 'But Google is starting from behind.', |         'text': 'But Google is starting from behind.', | ||||||
|         'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}], |         'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}], | ||||||
|         'title': None |         'title': None | ||||||
|     }] |  | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user