2016-10-28 18:42:00 +03:00
|
|
|
# cython: infer_types=True
|
2018-03-27 20:23:02 +03:00
|
|
|
# cython: profile=True
|
2015-10-08 18:00:45 +03:00
|
|
|
from __future__ import unicode_literals
|
2015-08-05 02:05:54 +03:00
|
|
|
from libcpp.vector cimport vector
|
2018-03-27 20:23:02 +03:00
|
|
|
from libc.stdint cimport int32_t, uint64_t, uint16_t
|
|
|
|
from preshed.maps cimport PreshMap
|
|
|
|
from cymem.cymem cimport Pool
|
2015-10-08 18:00:45 +03:00
|
|
|
from murmurhash.mrmr cimport hash64
|
2018-03-27 20:23:02 +03:00
|
|
|
from .typedefs cimport attr_t, hash_t
|
2017-10-27 22:07:59 +03:00
|
|
|
from .structs cimport TokenC
|
2018-03-27 20:23:02 +03:00
|
|
|
from .lexeme cimport attr_id_t
|
2015-08-05 02:05:54 +03:00
|
|
|
from .vocab cimport Vocab
|
2018-03-27 20:23:02 +03:00
|
|
|
from .tokens.doc cimport Doc
|
|
|
|
from .tokens.doc cimport get_token_attr
|
2018-11-15 05:00:58 +03:00
|
|
|
from .attrs cimport ID, attr_id_t, NULL_ATTR, ORTH
|
2018-11-14 21:10:46 +03:00
|
|
|
from .errors import Errors, TempErrors, Warnings, deprecation_warning
|
2015-08-04 16:55:28 +03:00
|
|
|
|
2017-10-27 22:07:59 +03:00
|
|
|
from .attrs import IDS
|
2015-10-08 18:00:45 +03:00
|
|
|
from .attrs import FLAG61 as U_ENT
|
|
|
|
from .attrs import FLAG60 as B2_ENT
|
|
|
|
from .attrs import FLAG59 as B3_ENT
|
|
|
|
from .attrs import FLAG58 as B4_ENT
|
|
|
|
from .attrs import FLAG43 as L2_ENT
|
|
|
|
from .attrs import FLAG42 as L3_ENT
|
|
|
|
from .attrs import FLAG41 as L4_ENT
|
2018-07-06 13:29:23 +03:00
|
|
|
from .attrs import FLAG43 as I2_ENT
|
|
|
|
from .attrs import FLAG42 as I3_ENT
|
|
|
|
from .attrs import FLAG41 as I4_ENT
|
|
|
|
|
💫 Port master changes over to develop (#2979)
* Create aryaprabhudesai.md (#2681)
* Update _install.jade (#2688)
Typo fix: "models" -> "model"
* Add FAC to spacy.explain (resolves #2706)
* Remove docstrings for deprecated arguments (see #2703)
* When calling getoption() in conftest.py, pass a default option (#2709)
* When calling getoption() in conftest.py, pass a default option
This is necessary to allow testing an installed spacy by running:
pytest --pyargs spacy
* Add contributor agreement
* update bengali token rules for hyphen and digits (#2731)
* Less norm computations in token similarity (#2730)
* Less norm computations in token similarity
* Contributor agreement
* Remove ')' for clarity (#2737)
Sorry, don't mean to be nitpicky, I just noticed this when going through the CLI and thought it was a quick fix. That said, if this was intentional then please let me know.
* added contributor agreement for mbkupfer (#2738)
* Basic support for Telugu language (#2751)
* Lex _attrs for polish language (#2750)
* Signed spaCy contributor agreement
* Added polish version of english lex_attrs
* Introduces a bulk merge function, in order to solve issue #653 (#2696)
* Fix comment
* Introduce bulk merge to increase performance on many span merges
* Sign contributor agreement
* Implement pull request suggestions
* Describe converters more explicitly (see #2643)
* Add multi-threading note to Language.pipe (resolves #2582) [ci skip]
* Fix formatting
* Fix dependency scheme docs (closes #2705) [ci skip]
* Don't set stop word in example (closes #2657) [ci skip]
* Add words to portuguese language _num_words (#2759)
* Add words to portuguese language _num_words
* Add words to portuguese language _num_words
* Update Indonesian model (#2752)
* adding e-KTP in tokenizer exceptions list
* add exception token
* removing lines with containing space as it won't matter since we use .split() method in the end, added new tokens in exception
* add tokenizer exceptions list
* combining base_norms with norm_exceptions
* adding norm_exception
* fix double key in lemmatizer
* remove unused import on punctuation.py
* reformat stop_words to reduce number of lines, improve readability
* updating tokenizer exception
* implement is_currency for lang/id
* adding orth_first_upper in tokenizer_exceptions
* update the norm_exception list
* remove bunch of abbreviations
* adding contributors file
* Fixed spaCy+Keras example (#2763)
* bug fixes in keras example
* created contributor agreement
* Adding French hyphenated first name (#2786)
* Fix typo (closes #2784)
* Fix typo (#2795) [ci skip]
Fixed typo on line 6 "regcognizer --> recognizer"
* Adding basic support for Sinhala language. (#2788)
* adding Sinhala language package, stop words, examples and lex_attrs.
* Adding contributor agreement
* Updating contributor agreement
* Also include lowercase norm exceptions
* Fix error (#2802)
* Fix error
ValueError: cannot resize an array that references or is referenced
by another array in this way. Use the resize function
* added spaCy Contributor Agreement
* Add charlax's contributor agreement (#2805)
* agreement of contributor, may I introduce a tiny pl languge contribution (#2799)
* Contributors agreement
* Contributors agreement
* Contributors agreement
* Add jupyter=True to displacy.render in documentation (#2806)
* Revert "Also include lowercase norm exceptions"
This reverts commit 70f4e8adf37cfcfab60be2b97d6deae949b30e9e.
* Remove deprecated encoding argument to msgpack
* Set up dependency tree pattern matching skeleton (#2732)
* Fix bug when too many entity types. Fixes #2800
* Fix Python 2 test failure
* Require older msgpack-numpy
* Restore encoding arg on msgpack-numpy
* Try to fix version pin for msgpack-numpy
* Update Portuguese Language (#2790)
* Add words to portuguese language _num_words
* Add words to portuguese language _num_words
* Portuguese - Add/remove stopwords, fix tokenizer, add currency symbols
* Extended punctuation and norm_exceptions in the Portuguese language
* Correct error in spacy universe docs concerning spacy-lookup (#2814)
* Update Keras Example for (Parikh et al, 2016) implementation (#2803)
* bug fixes in keras example
* created contributor agreement
* baseline for Parikh model
* initial version of parikh 2016 implemented
* tested asymmetric models
* fixed grievous error in normalization
* use standard SNLI test file
* begin to rework parikh example
* initial version of running example
* start to document the new version
* start to document the new version
* Update Decompositional Attention.ipynb
* fixed calls to similarity
* updated the README
* import sys package duh
* simplified indexing on mapping word to IDs
* stupid python indent error
* added code from https://github.com/tensorflow/tensorflow/issues/3388 for tf bug workaround
* Fix typo (closes #2815) [ci skip]
* Update regex version dependency
* Set version to 2.0.13.dev3
* Skip seemingly problematic test
* Remove problematic test
* Try previous version of regex
* Revert "Remove problematic test"
This reverts commit bdebbef45552d698d390aa430b527ee27830f11b.
* Unskip test
* Try older version of regex
* 💫 Update training examples and use minibatching (#2830)
<!--- Provide a general summary of your changes in the title. -->
## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.
### Types of change
enhancements
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Visual C++ link updated (#2842) (closes #2841) [ci skip]
* New landing page
* Add contribution agreement
* Correcting lang/ru/examples.py (#2845)
* Correct some grammatical inaccuracies in lang\ru\examples.py; filled Contributor Agreement
* Correct some grammatical inaccuracies in lang\ru\examples.py
* Move contributor agreement to separate file
* Set version to 2.0.13.dev4
* Add Persian(Farsi) language support (#2797)
* Also include lowercase norm exceptions
* Remove in favour of https://github.com/explosion/spaCy/graphs/contributors
* Rule-based French Lemmatizer (#2818)
<!--- Provide a general summary of your changes in the title. -->
## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->
Add a rule-based French Lemmatizer following the english one and the excellent PR for [greek language optimizations](https://github.com/explosion/spaCy/pull/2558) to adapt the Lemmatizer class.
### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->
- Lemma dictionary used can be found [here](http://infolingu.univ-mlv.fr/DonneesLinguistiques/Dictionnaires/telechargement.html), I used the XML version.
- Add several files containing exhaustive list of words for each part of speech
- Add some lemma rules
- Add POS that are not checked in the standard Lemmatizer, i.e PRON, DET, ADV and AUX
- Modify the Lemmatizer class to check in lookup table as a last resort if POS not mentionned
- Modify the lemmatize function to check in lookup table as a last resort
- Init files are updated so the model can support all the functionalities mentioned above
- Add words to tokenizer_exceptions_list.py in respect to regex used in tokenizer_exceptions.py
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [X] I have submitted the spaCy Contributor Agreement.
- [X] I ran the tests, and all new and existing tests passed.
- [X] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Set version to 2.0.13
* Fix formatting and consistency
* Update docs for new version [ci skip]
* Increment version [ci skip]
* Add info on wheels [ci skip]
* Adding "This is a sentence" example to Sinhala (#2846)
* Add wheels badge
* Update badge [ci skip]
* Update README.rst [ci skip]
* Update murmurhash pin
* Increment version to 2.0.14.dev0
* Update GPU docs for v2.0.14
* Add wheel to setup_requires
* Import prefer_gpu and require_gpu functions from Thinc
* Add tests for prefer_gpu() and require_gpu()
* Update requirements and setup.py
* Workaround bug in thinc require_gpu
* Set version to v2.0.14
* Update push-tag script
* Unhack prefer_gpu
* Require thinc 6.10.6
* Update prefer_gpu and require_gpu docs [ci skip]
* Fix specifiers for GPU
* Set version to 2.0.14.dev1
* Set version to 2.0.14
* Update Thinc version pin
* Increment version
* Fix msgpack-numpy version pin
* Increment version
* Update version to 2.0.16
* Update version [ci skip]
* Redundant ')' in the Stop words' example (#2856)
<!--- Provide a general summary of your changes in the title. -->
## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->
### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [ ] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Documentation improvement regarding joblib and SO (#2867)
Some documentation improvements
## Description
1. Fixed the dead URL to joblib
2. Fixed Stack Overflow brand name (with space)
### Types of change
Documentation
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* raise error when setting overlapping entities as doc.ents (#2880)
* Fix out-of-bounds access in NER training
The helper method state.B(1) gets the index of the first token of the
buffer, or -1 if no such token exists. Normally this is safe because we
pass this to functions like state.safe_get(), which returns an empty
token. Here we used it directly as an array index, which is not okay!
This error may have been the cause of out-of-bounds access errors during
training. Similar errors may still be around, so must be hunted down.
Hunting this one down took a long time...I printed out values across
training runs and diffed, looking for points of divergence between
runs, when no randomness should be allowed.
* Change PyThaiNLP Url (#2876)
* Fix missing comma
* Add example showing a fix-up rule for space entities
* Set version to 2.0.17.dev0
* Update regex version
* Revert "Update regex version"
This reverts commit 62358dd867d15bc6a475942dff34effba69dd70a.
* Try setting older regex version, to align with conda
* Set version to 2.0.17
* Add spacy-js to universe [ci-skip]
* Add spacy-raspberry to universe (closes #2889)
* Add script to validate universe json [ci skip]
* Removed space in docs + added contributor indo (#2909)
* - removed unneeded space in documentation
* - added contributor info
* Allow input text of length up to max_length, inclusive (#2922)
* Include universe spec for spacy-wordnet component (#2919)
* feat: include universe spec for spacy-wordnet component
* chore: include spaCy contributor agreement
* Minor formatting changes [ci skip]
* Fix image [ci skip]
Twitter URL doesn't work on live site
* Check if the word is in one of the regular lists specific to each POS (#2886)
* 💫 Create random IDs for SVGs to prevent ID clashes (#2927)
Resolves #2924.
## Description
Fixes problem where multiple visualizations in Jupyter notebooks would have clashing arc IDs, resulting in weirdly positioned arc labels. Generating a random ID prefix so even identical parses won't receive the same IDs for consistency (even if effect of ID clash isn't noticable here.)
### Types of change
bug fix
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Fix typo [ci skip]
* fixes symbolic link on py3 and windows (#2949)
* fixes symbolic link on py3 and windows
during setup of spacy using command
python -m spacy link en_core_web_sm en
closes #2948
* Update spacy/compat.py
Co-Authored-By: cicorias <cicorias@users.noreply.github.com>
* Fix formatting
* Update universe [ci skip]
* Catalan Language Support (#2940)
* Catalan language Support
* Adding Catalan to documentation
* Sort languages alphabetically [ci skip]
* Update tests for pytest 4.x (#2965)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Replace marks in params for pytest 4.0 compat ([see here](https://docs.pytest.org/en/latest/deprecations.html#marks-in-pytest-mark-parametrize))
- [x] Un-xfail passing tests (some fixes in a recent update resolved a bunch of issues, but tests were apparently never updated here)
### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Fix regex pin to harmonize with conda (#2964)
* Update README.rst
* Fix bug where Vocab.prune_vector did not use 'batch_size' (#2977)
Fixes #2976
* Fix typo
* Fix typo
* Remove duplicate file
* Require thinc 7.0.0.dev2
Fixes bug in gpu_ops that would use cupy instead of numpy on CPU
* Add missing import
* Fix error IDs
* Fix tests
2018-11-29 18:30:29 +03:00
|
|
|
# Separator string.
# NOTE(review): presumably used to join the parts of a composite pattern key;
# no use is visible in this part of the file - confirm against callers.
# (This constant was previously assigned twice with the same value.)
DELIMITER = '||'

# NOTE(review): these look like positions of the head and of the relation
# operator inside a per-token tuple; neither is used in the code shown here -
# confirm against callers before relying on that reading.
INDEX_HEAD = 1
INDEX_RELOP = 0
|
2015-10-08 18:00:45 +03:00
|
|
|
|
2018-03-27 20:23:02 +03:00
|
|
|
# Transition codes returned by get_action() and interpreted by
# transition_states(). In the code visible here they are only ever compared
# for equality / membership, never combined arithmetically.
# NOTE(review): the digits mirror the "match, advance, retry, extend" strings
# described in the get_action() docstring, but they are ordinary integer
# literals, not bit flags; the leading-zero ones are presumably read as octal.
# Confirm before doing any bitwise work on them.
cdef enum action_t:
    # Drop the state; nothing is emitted.
    REJECT = 0000
    # Emit a match that includes the current token; the state is not kept.
    MATCH = 1000
    # Move the state to the next pattern token and count the current token.
    ADVANCE = 0100
    # Try the same token again against the next pattern token.
    RETRY = 0010
    # As RETRY, and also queue a state that has consumed the current token.
    RETRY_EXTEND = 0011
    # Emit a match and keep the state alive with the current token counted.
    MATCH_EXTEND = 1001
    # Emit a match that excludes the current token; the state is not kept.
    MATCH_REJECT = 2000
|
|
|
|
|
|
|
|
|
|
|
|
# How many tokens a single pattern token may consume. Values are the default
# C enum numbering (ZERO is 0, each following member one higher).
cdef enum quantifier_t:
    # Negation: get_action() inverts the match result and then treats the
    # pattern token as ONE.
    ZERO
    # Optional: on a non-match the pattern token is skipped.
    ZERO_ONE
    # Repeated, possibly empty: on a non-match the pattern token is skipped,
    # on a match the state may keep consuming tokens.
    ZERO_PLUS
    # Exactly one matching token is required.
    ONE
    # NOTE(review): get_action() has no branch for this value in the code
    # shown here; presumably it is rewritten into other quantifiers before
    # matching - confirm.
    ONE_PLUS
|
2016-09-21 15:54:55 +03:00
|
|
|
|
|
|
|
|
|
|
|
# One (attribute, expected value) constraint of a pattern token.
# get_is_match() requires get_token_attr(token, attr) to equal `value`.
cdef struct AttrValueC:
    # Which token attribute to read.
    attr_id_t attr
    # The value that attribute must have.
    attr_t value
|
2015-08-04 16:55:28 +03:00
|
|
|
|
2016-09-21 15:54:55 +03:00
|
|
|
# Specification of a single pattern token. Patterns are walked as arrays of
# these (`pattern += 1`). get_is_final() recognises the end of a pattern by an
# entry whose attrs[0].attr is ID and whose nr_attr is 0; get_ent_id() returns
# the `value` stored in such an entry.
cdef struct TokenPatternC:
    # Constraints the token must satisfy; the first nr_attr entries are read.
    AttrValueC* attrs
    # Number of constraints in `attrs`.
    int32_t nr_attr
    # How many tokens this pattern token may consume.
    quantifier_t quantifier
    # NOTE(review): not read anywhere in the code shown here; presumably a
    # hash identifying this specification - confirm.
    hash_t key
|
|
|
|
|
|
|
|
|
|
|
|
# A partial match in progress.
cdef struct PatternStateC:
    # The pattern token the state is currently waiting on.
    TokenPatternC* pattern
    # Index of the document token at which this state was started
    # (find_matches() creates states with the current token index).
    int32_t start
    # Number of tokens consumed so far (states are created with 0).
    int32_t length
|
|
|
|
|
|
|
|
|
|
|
|
# A completed match. find_matches() reports it as
# (pattern_id, start, start + length).
cdef struct MatchC:
    # Value returned by get_ent_id() for the pattern that matched.
    attr_t pattern_id
    # Index of the first token of the match.
    int32_t start
    # Number of tokens in the match.
    int32_t length
|
|
|
|
|
|
|
|
|
|
|
|
cdef find_matches(TokenPatternC** patterns, int n, Doc doc):
    '''Run every pattern over the document and collect all matches.

    patterns: array of `n` pattern pointers.
    n: number of entries in `patterns`.
    doc: the document to scan.

    Returns a list of (pattern_id, start, end) tuples, where `end` is
    `start + length` of the match.
    '''
    cdef vector[PatternStateC] states
    cdef vector[MatchC] matches
    cdef Pool mem = Pool()
    cdef int token_i, pattern_i, match_i
    # TODO: Prefill this with the extra attribute values.
    # One pointer slot per token; nothing in this function fills them in.
    extra_attrs = <attr_t**>mem.alloc(len(doc), sizeof(attr_t*))
    for token_i in range(doc.length):
        # Every pattern gets a fresh, empty state starting at this token.
        for pattern_i in range(n):
            states.push_back(PatternStateC(patterns[pattern_i], token_i, 0))
        # Feed the token to all open states (old and fresh alike).
        transition_states(states, matches, &doc.c[token_i], extra_attrs[token_i])
    # States still open at the end of the document may complete through
    # trailing zero-width pattern tokens.
    finish_states(matches, states)
    output = []
    for match_i in range(matches.size()):
        output.append((matches[match_i].pattern_id,
                       matches[match_i].start,
                       matches[match_i].start + matches[match_i].length))
    return output
|
|
|
|
|
|
|
|
|
2018-08-15 17:19:08 +03:00
|
|
|
cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
    # Scan forward from `pattern` and return the `value` of the first attr of
    # the entry the scan stops on.
    #
    # The code was originally designed to always have pattern[1].attrs.value
    # be the ent_id when we get to the end of a pattern. However, Issue #2671
    # showed this wasn't the case when we had a reject-and-continue before a
    # match. I still don't really understand what's going on here, but this
    # workaround does resolve the issue.
    #
    # NOTE(review): the scan stops as soon as EITHER the first attr is ID OR
    # nr_attr is 0. An ordinary pattern token that has no attrs would therefore
    # also stop the scan, and its attrs value would be returned instead of the
    # terminator's - confirm whether such tokens can occur before relying on
    # this for patterns containing attribute-less tokens.
    while pattern.attrs.attr != ID and pattern.nr_attr > 0:
        pattern += 1
    return pattern.attrs.value
|
|
|
|
|
2018-03-27 20:23:02 +03:00
|
|
|
|
|
|
|
cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
                            const TokenC* token, const attr_t* extra_attrs) except *:
    '''Feed one token to every open state, updating `states` in place.

    states: open partial matches; compacted in place so that only the
        surviving states remain, followed by any newly queued ones.
    matches: completed matches are appended here.
    token: the current document token.
    extra_attrs: passed through to get_action().
    '''
    # Write cursor for in-place compaction: surviving states are stored at
    # states[q]; q only advances when a state stays alive, so rejected states
    # are overwritten by later survivors.
    cdef int q = 0
    cdef vector[PatternStateC] new_states
    for i in range(states.size()):
        action = get_action(states[i], token, extra_attrs)
        if action == REJECT:
            continue
        # `state` keeps the start/length the state had when it reached this
        # token; states[q] is the working copy whose pattern pointer moves.
        state = states[i]
        states[q] = state
        while action in (RETRY, RETRY_EXTEND):
            if action == RETRY_EXTEND:
                # Queue a state that stays on the pattern token which just
                # matched and counts the current token. The pattern pointer
                # must come from states[q]: after one or more retries
                # `state.pattern` still points at the pattern token the chain
                # started from, which would re-open already skipped tokens.
                new_states.push_back(
                    PatternStateC(pattern=states[q].pattern, start=state.start,
                                  length=state.length+1))
            # Try the same token against the next pattern token.
            states[q].pattern += 1
            action = get_action(states[q], token, extra_attrs)
        if action == REJECT:
            # Slot q is not kept; the next survivor overwrites it.
            pass
        elif action == ADVANCE:
            states[q].pattern += 1
            states[q].length += 1
            q += 1
        else:
            # Every MATCH* action is only returned by get_action() when the
            # state it was given is final, i.e. when states[q].pattern[1] is
            # the terminator entry. Look the id up from there rather than
            # from the stale `state.pattern`, so the scan in get_ent_id()
            # cannot stop early on an intermediate attribute-less token.
            ent_id = get_ent_id(&states[q].pattern[1])
            if action == MATCH:
                # The current token is part of the match.
                matches.push_back(
                    MatchC(pattern_id=ent_id, start=state.start,
                           length=state.length+1))
            elif action == MATCH_REJECT:
                # The current token is not part of the match.
                matches.push_back(
                    MatchC(pattern_id=ent_id, start=state.start,
                           length=state.length))
            elif action == MATCH_EXTEND:
                # Emit what has been consumed so far and keep the state open
                # with the current token counted.
                matches.push_back(
                    MatchC(pattern_id=ent_id, start=state.start,
                           length=state.length))
                states[q].length += 1
                q += 1
    # Drop everything past the last survivor, then add the queued states.
    states.resize(q)
    for i in range(new_states.size()):
        states.push_back(new_states[i])
|
|
|
|
|
|
|
|
|
|
|
|
cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states) except *:
    '''Handle states that end in zero-width patterns.

    Called once no tokens are left. For every open state, skip over pattern
    tokens whose quantifier allows consuming nothing (ZERO_PLUS, ZERO_ONE);
    if the end of the pattern is reached that way, append a match covering
    the tokens the state has consumed so far.

    matches: completed matches are appended here.
    states: the open states; not modified (each one is examined as a copy).
    '''
    cdef PatternStateC state
    for i in range(states.size()):
        state = states[i]
        while get_quantifier(state) in (ZERO_PLUS, ZERO_ONE):
            is_final = get_is_final(state)
            if is_final:
                # is_final means state.pattern[1] is the terminator entry, so
                # read the id from there - the same position
                # transition_states() uses. Starting the lookup at
                # state.pattern itself would return the wrong value when this
                # last pattern token has no attrs.
                ent_id = get_ent_id(&state.pattern[1])
                matches.push_back(
                    MatchC(pattern_id=ent_id, start=state.start, length=state.length))
                break
            else:
                # Optional pattern token: skip it and look at the next one.
                state.pattern += 1
|
|
|
|
|
|
|
|
|
|
|
|
cdef action_t get_action(PatternStateC state, const TokenC* token, const attr_t* extra_attrs) nogil:
    '''Decide how a pattern state reacts to the current token.

    Three facts determine the result:

      a) Does the token satisfy the current pattern token? [yes, no]
      b) What is its quantifier? [1, 0+, ?]  (ZERO is handled as an inverted 1)
      c) Is it the last pattern token? [final, non-final]

    Outcomes:

      quantifier   token matches   final          non-final
      ----------   -------------   ------------   ------------
      1            yes             MATCH          ADVANCE
      1            no              REJECT         REJECT
      0+           yes             MATCH_EXTEND   RETRY_EXTEND
      0+           no              MATCH_REJECT   RETRY
      ?            yes             MATCH          ADVANCE
      ?            no              MATCH_REJECT   RETRY

    MATCH_REJECT emits a match that does not include the current token.

    Any other quantifier value (e.g. ONE_PLUS) falls through all branches
    without an explicit return.

    Known problem: if a quantifier is matching, we're adding a lot of open
    partials.
    '''
    cdef char is_match = get_is_match(state, token, extra_attrs)
    quantifier = get_quantifier(state)
    is_final = get_is_final(state)
    if quantifier == ZERO:
        # Negated pattern token: flip the result, then treat it as "exactly one".
        is_match = not is_match
        quantifier = ONE
    if quantifier == ONE:
        if not is_match:
            return REJECT
        if is_final:
            return MATCH
        return ADVANCE
    elif quantifier == ZERO_PLUS:
        if is_match:
            if is_final:
                return MATCH_EXTEND
            return RETRY_EXTEND
        if is_final:
            # Match, but don't include the current token.
            return MATCH_REJECT
        return RETRY
    elif quantifier == ZERO_ONE:
        if is_match:
            if is_final:
                return MATCH
            return ADVANCE
        if is_final:
            # Match, but don't include the current token.
            return MATCH_REJECT
        return RETRY
|
|
|
|
|
|
|
|
|
|
|
|
cdef char get_is_match(PatternStateC state, const TokenC* token, const attr_t* extra_attrs) nogil:
    '''Return 1 if the token satisfies every constraint of the state's current
    pattern token, 0 otherwise. A pattern token with no constraints matches
    any token. `extra_attrs` is accepted but not consulted here.
    '''
    cdef TokenPatternC* spec = state.pattern
    cdef int k
    for k in range(spec.nr_attr):
        # The first constraint that disagrees with the token decides the result.
        if get_token_attr(token, spec.attrs[k].attr) != spec.attrs[k].value:
            return 0
    return 1
|
|
|
|
|
|
|
|
|
|
|
|
cdef char get_is_final(PatternStateC state) nogil:
    '''Return 1 if the state is on the last pattern token, i.e. the entry
    after it is the terminator (first attr is ID and nr_attr is 0), else 0.
    '''
    cdef TokenPatternC* following = &state.pattern[1]
    if following.attrs[0].attr != ID:
        return 0
    if following.nr_attr != 0:
        return 0
    return 1
|
2015-08-04 16:55:28 +03:00
|
|
|
|
|
|
|
|
2018-03-27 20:23:02 +03:00
|
|
|
cdef char get_quantifier(PatternStateC state) nogil:
    '''Return the quantifier of the pattern token the state is currently on.'''
    return state.pattern[0].quantifier
|
2016-09-21 15:54:55 +03:00
|
|
|
|
💫 Port master changes over to develop (#2979)
* Create aryaprabhudesai.md (#2681)
* Update _install.jade (#2688)
Typo fix: "models" -> "model"
* Add FAC to spacy.explain (resolves #2706)
* Remove docstrings for deprecated arguments (see #2703)
* When calling getoption() in conftest.py, pass a default option (#2709)
* When calling getoption() in conftest.py, pass a default option
This is necessary to allow testing an installed spacy by running:
pytest --pyargs spacy
* Add contributor agreement
* update bengali token rules for hyphen and digits (#2731)
* Less norm computations in token similarity (#2730)
* Less norm computations in token similarity
* Contributor agreement
* Remove ')' for clarity (#2737)
Sorry, don't mean to be nitpicky, I just noticed this when going through the CLI and thought it was a quick fix. That said, if this was intentional then please let me know.
* added contributor agreement for mbkupfer (#2738)
* Basic support for Telugu language (#2751)
* Lex _attrs for polish language (#2750)
* Signed spaCy contributor agreement
* Added polish version of english lex_attrs
* Introduces a bulk merge function, in order to solve issue #653 (#2696)
* Fix comment
* Introduce bulk merge to increase performance on many span merges
* Sign contributor agreement
* Implement pull request suggestions
* Describe converters more explicitly (see #2643)
* Add multi-threading note to Language.pipe (resolves #2582) [ci skip]
* Fix formatting
* Fix dependency scheme docs (closes #2705) [ci skip]
* Don't set stop word in example (closes #2657) [ci skip]
* Add words to portuguese language _num_words (#2759)
* Add words to portuguese language _num_words
* Add words to portuguese language _num_words
* Update Indonesian model (#2752)
* adding e-KTP in tokenizer exceptions list
* add exception token
* removing lines with containing space as it won't matter since we use .split() method in the end, added new tokens in exception
* add tokenizer exceptions list
* combining base_norms with norm_exceptions
* adding norm_exception
* fix double key in lemmatizer
* remove unused import on punctuation.py
* reformat stop_words to reduce number of lines, improve readability
* updating tokenizer exception
* implement is_currency for lang/id
* adding orth_first_upper in tokenizer_exceptions
* update the norm_exception list
* remove bunch of abbreviations
* adding contributors file
* Fixed spaCy+Keras example (#2763)
* bug fixes in keras example
* created contributor agreement
* Adding French hyphenated first name (#2786)
* Fix typo (closes #2784)
* Fix typo (#2795) [ci skip]
Fixed typo on line 6 "regcognizer --> recognizer"
* Adding basic support for Sinhala language. (#2788)
* adding Sinhala language package, stop words, examples and lex_attrs.
* Adding contributor agreement
* Updating contributor agreement
* Also include lowercase norm exceptions
* Fix error (#2802)
* Fix error
ValueError: cannot resize an array that references or is referenced
by another array in this way. Use the resize function
* added spaCy Contributor Agreement
* Add charlax's contributor agreement (#2805)
* agreement of contributor, may I introduce a tiny pl languge contribution (#2799)
* Contributors agreement
* Contributors agreement
* Contributors agreement
* Add jupyter=True to displacy.render in documentation (#2806)
* Revert "Also include lowercase norm exceptions"
This reverts commit 70f4e8adf37cfcfab60be2b97d6deae949b30e9e.
* Remove deprecated encoding argument to msgpack
* Set up dependency tree pattern matching skeleton (#2732)
* Fix bug when too many entity types. Fixes #2800
* Fix Python 2 test failure
* Require older msgpack-numpy
* Restore encoding arg on msgpack-numpy
* Try to fix version pin for msgpack-numpy
* Update Portuguese Language (#2790)
* Add words to portuguese language _num_words
* Add words to portuguese language _num_words
* Portuguese - Add/remove stopwords, fix tokenizer, add currency symbols
* Extended punctuation and norm_exceptions in the Portuguese language
* Correct error in spacy universe docs concerning spacy-lookup (#2814)
* Update Keras Example for (Parikh et al, 2016) implementation (#2803)
* bug fixes in keras example
* created contributor agreement
* baseline for Parikh model
* initial version of parikh 2016 implemented
* tested asymmetric models
* fixed grievous error in normalization
* use standard SNLI test file
* begin to rework parikh example
* initial version of running example
* start to document the new version
* start to document the new version
* Update Decompositional Attention.ipynb
* fixed calls to similarity
* updated the README
* import sys package duh
* simplified indexing on mapping word to IDs
* stupid python indent error
* added code from https://github.com/tensorflow/tensorflow/issues/3388 for tf bug workaround
* Fix typo (closes #2815) [ci skip]
* Update regex version dependency
* Set version to 2.0.13.dev3
* Skip seemingly problematic test
* Remove problematic test
* Try previous version of regex
* Revert "Remove problematic test"
This reverts commit bdebbef45552d698d390aa430b527ee27830f11b.
* Unskip test
* Try older version of regex
* 💫 Update training examples and use minibatching (#2830)
<!--- Provide a general summary of your changes in the title. -->
## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.
### Types of change
enhancements
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Visual C++ link updated (#2842) (closes #2841) [ci skip]
* New landing page
* Add contribution agreement
* Correcting lang/ru/examples.py (#2845)
* Correct some grammatical inaccuracies in lang\ru\examples.py; filled Contributor Agreement
* Correct some grammatical inaccuracies in lang\ru\examples.py
* Move contributor agreement to separate file
* Set version to 2.0.13.dev4
* Add Persian(Farsi) language support (#2797)
* Also include lowercase norm exceptions
* Remove in favour of https://github.com/explosion/spaCy/graphs/contributors
* Rule-based French Lemmatizer (#2818)
<!--- Provide a general summary of your changes in the title. -->
## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->
Add a rule-based French Lemmatizer following the english one and the excellent PR for [greek language optimizations](https://github.com/explosion/spaCy/pull/2558) to adapt the Lemmatizer class.
### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->
- Lemma dictionary used can be found [here](http://infolingu.univ-mlv.fr/DonneesLinguistiques/Dictionnaires/telechargement.html), I used the XML version.
- Add several files containing exhaustive list of words for each part of speech
- Add some lemma rules
- Add POS that are not checked in the standard Lemmatizer, i.e PRON, DET, ADV and AUX
- Modify the Lemmatizer class to check in lookup table as a last resort if POS not mentioned
- Modify the lemmatize function to check in lookup table as a last resort
- Init files are updated so the model can support all the functionalities mentioned above
- Add words to tokenizer_exceptions_list.py in respect to regex used in tokenizer_exceptions.py
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [X] I have submitted the spaCy Contributor Agreement.
- [X] I ran the tests, and all new and existing tests passed.
- [X] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Set version to 2.0.13
* Fix formatting and consistency
* Update docs for new version [ci skip]
* Increment version [ci skip]
* Add info on wheels [ci skip]
* Adding "This is a sentence" example to Sinhala (#2846)
* Add wheels badge
* Update badge [ci skip]
* Update README.rst [ci skip]
* Update murmurhash pin
* Increment version to 2.0.14.dev0
* Update GPU docs for v2.0.14
* Add wheel to setup_requires
* Import prefer_gpu and require_gpu functions from Thinc
* Add tests for prefer_gpu() and require_gpu()
* Update requirements and setup.py
* Workaround bug in thinc require_gpu
* Set version to v2.0.14
* Update push-tag script
* Unhack prefer_gpu
* Require thinc 6.10.6
* Update prefer_gpu and require_gpu docs [ci skip]
* Fix specifiers for GPU
* Set version to 2.0.14.dev1
* Set version to 2.0.14
* Update Thinc version pin
* Increment version
* Fix msgpack-numpy version pin
* Increment version
* Update version to 2.0.16
* Update version [ci skip]
* Redundant ')' in the Stop words' example (#2856)
<!--- Provide a general summary of your changes in the title. -->
## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->
### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [ ] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Documentation improvement regarding joblib and SO (#2867)
Some documentation improvements
## Description
1. Fixed the dead URL to joblib
2. Fixed Stack Overflow brand name (with space)
### Types of change
Documentation
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* raise error when setting overlapping entities as doc.ents (#2880)
* Fix out-of-bounds access in NER training
The helper method state.B(1) gets the index of the first token of the
buffer, or -1 if no such token exists. Normally this is safe because we
pass this to functions like state.safe_get(), which returns an empty
token. Here we used it directly as an array index, which is not okay!
This error may have been the cause of out-of-bounds access errors during
training. Similar errors may still be around, so must be hunted down.
Hunting this one down took a long time...I printed out values across
training runs and diffed, looking for points of divergence between
runs, when no randomness should be allowed.
* Change PyThaiNLP Url (#2876)
* Fix missing comma
* Add example showing a fix-up rule for space entities
* Set version to 2.0.17.dev0
* Update regex version
* Revert "Update regex version"
This reverts commit 62358dd867d15bc6a475942dff34effba69dd70a.
* Try setting older regex version, to align with conda
* Set version to 2.0.17
* Add spacy-js to universe [ci-skip]
* Add spacy-raspberry to universe (closes #2889)
* Add script to validate universe json [ci skip]
* Removed space in docs + added contributor info (#2909)
* - removed unneeded space in documentation
* - added contributor info
* Allow input text of length up to max_length, inclusive (#2922)
* Include universe spec for spacy-wordnet component (#2919)
* feat: include universe spec for spacy-wordnet component
* chore: include spaCy contributor agreement
* Minor formatting changes [ci skip]
* Fix image [ci skip]
Twitter URL doesn't work on live site
* Check if the word is in one of the regular lists specific to each POS (#2886)
* 💫 Create random IDs for SVGs to prevent ID clashes (#2927)
Resolves #2924.
## Description
Fixes problem where multiple visualizations in Jupyter notebooks would have clashing arc IDs, resulting in weirdly positioned arc labels. Generating a random ID prefix so even identical parses won't receive the same IDs for consistency (even if effect of ID clash isn't noticeable here.)
### Types of change
bug fix
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Fix typo [ci skip]
* fixes symbolic link on py3 and windows (#2949)
* fixes symbolic link on py3 and windows
during setup of spacy using command
python -m spacy link en_core_web_sm en
closes #2948
* Update spacy/compat.py
Co-Authored-By: cicorias <cicorias@users.noreply.github.com>
* Fix formatting
* Update universe [ci skip]
* Catalan Language Support (#2940)
* Catalan language Support
* Adding Catalan to documentation
* Sort languages alphabetically [ci skip]
* Update tests for pytest 4.x (#2965)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Replace marks in params for pytest 4.0 compat ([see here](https://docs.pytest.org/en/latest/deprecations.html#marks-in-pytest-mark-parametrize))
- [x] Un-xfail passing tests (some fixes in a recent update resolved a bunch of issues, but tests were apparently never updated here)
### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Fix regex pin to harmonize with conda (#2964)
* Update README.rst
* Fix bug where Vocab.prune_vector did not use 'batch_size' (#2977)
Fixes #2976
* Fix typo
* Fix typo
* Remove duplicate file
* Require thinc 7.0.0.dev2
Fixes bug in gpu_ops that would use cupy instead of numpy on CPU
* Add missing import
* Fix error IDs
* Fix tests
2018-11-29 18:30:29 +03:00
|
|
|
|
2018-09-05 06:53:21 +03:00
|
|
|
DEF PADDING = 5
|
|
|
|
|
2016-09-21 15:54:55 +03:00
|
|
|
|
2017-05-20 14:54:53 +03:00
|
|
|
cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
                                 object token_specs) except NULL:
    """Build a C array of token patterns from converted token specs.

    mem (Pool): Pool the pattern structs and attribute arrays are allocated
        from.
    entity_id (attr_t): Match ID written to the terminal entry.
    token_specs (list): `(quantifier, [(attr, value), ...])` pairs, one per
        token pattern.
    RETURNS (TokenPatternC*): Array of `len(token_specs) + 1` entries. The
        last entry is a terminal with `nr_attr == 0` whose first attribute
        slot holds `(ID, entity_id)`.
    """
    cdef int i
    cdef int n_tokens = len(token_specs)
    cdef TokenPatternC* pattern
    # One extra slot for the terminal entry that carries the match ID.
    pattern = <TokenPatternC*>mem.alloc(n_tokens + 1, sizeof(TokenPatternC))
    for i, (quantifier, spec) in enumerate(token_specs):
        n_attr = len(spec)
        pattern[i].quantifier = quantifier
        pattern[i].attrs = <AttrValueC*>mem.alloc(n_attr, sizeof(AttrValueC))
        pattern[i].nr_attr = n_attr
        for j, (attr, value) in enumerate(spec):
            pattern[i].attrs[j].attr = attr
            pattern[i].attrs[j].value = value
        # Hash the raw attribute array so each token pattern gets a key.
        pattern[i].key = hash64(pattern[i].attrs,
                                pattern[i].nr_attr * sizeof(AttrValueC), 0)
    # Terminal entry: nr_attr stays 0 so that readers can detect the end of
    # the array, while attrs[0] still stores the match ID.
    i = n_tokens
    pattern[i].attrs = <AttrValueC*>mem.alloc(2, sizeof(AttrValueC))
    pattern[i].attrs[0].attr = ID
    pattern[i].attrs[0].value = entity_id
    pattern[i].nr_attr = 0
    return pattern
|
|
|
|
|
|
|
|
|
2018-03-27 20:23:02 +03:00
|
|
|
cdef attr_t get_pattern_key(const TokenPatternC* pattern) nogil:
    """Return the match ID stored on the terminal entry of a pattern array.

    pattern (const TokenPatternC*): Pointer to the first entry of an array
        that ends with an entry whose `nr_attr` is 0.
    RETURNS (attr_t): The value of the terminal entry's first attribute.
    RAISES (ValueError): If that attribute is not `ID`.
    """
    # Walk forward to the terminal entry, which is marked by nr_attr == 0.
    while pattern.nr_attr != 0:
        pattern += 1
    terminal = pattern.attrs[0]
    if terminal.attr == ID:
        return terminal.value
    # NOTE(review): the signature has no `except` clause, so depending on the
    # Cython version this exception may be reported rather than propagated to
    # the caller -- confirm before relying on it.
    with gil:
        raise ValueError(Errors.E074.format(attr=ID, bad_attr=terminal.attr))
|
|
|
|
|
2015-08-06 15:33:21 +03:00
|
|
|
def _convert_strings(token_specs, string_store):
    """Convert user-facing token specs into `(quantifier, attrs)` pairs.

    token_specs (iterable): Dicts mapping attribute names or IDs to values,
        optionally with an operator under the key "OP" (case-insensitive).
    string_store: Store used to convert string values to IDs via `add`.
    RETURNS (list): `(quantifier, [(attr_id, value), ...])` tuples. A spec
        with the '+' operator yields two entries (ONE, then ZERO_PLUS) that
        share the same attribute list.
    RAISES (KeyError): If an operator is not one of the supported ones.
    """
    # Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS
    operators = {'*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS),
                 '?': (ZERO_ONE,), '1': (ONE,), '!': (ZERO,)}
    tokens = []
    for spec in token_specs:
        if not spec:
            # Signifier for 'any token'
            tokens.append((ONE, [(NULL_ATTR, 0)]))
            continue
        token = []
        ops = (ONE,)
        for attr, value in spec.items():
            if isinstance(attr, basestring) and attr.upper() == 'OP':
                if value not in operators:
                    keys = ', '.join(operators.keys())
                    raise KeyError(Errors.E011.format(op=value, opts=keys))
                ops = operators[value]
                # The operator is not a token attribute: skip it explicitly
                # instead of interning its string and relying on the
                # attribute lookup below to discard it.
                continue
            if isinstance(attr, basestring):
                attr = IDS.get(attr.upper())
            if isinstance(value, basestring):
                value = string_store.add(value)
            if isinstance(value, bool):
                value = int(value)
            # Attribute names missing from IDS resolve to None and are
            # dropped from the pattern.
            if attr is not None:
                token.append((attr, value))
        for op in ops:
            tokens.append((op, token))
    return tokens
|
2015-10-08 18:00:45 +03:00
|
|
|
|
|
|
|
|
2015-08-05 02:05:54 +03:00
|
|
|
cdef class Matcher:
    """Match sequences of tokens, based on pattern rules."""
    cdef Pool mem
    cdef vector[TokenPatternC*] patterns
    cdef readonly Vocab vocab
    cdef public object _patterns
    cdef public object _entities
    cdef public object _callbacks

    def __init__(self, vocab):
        """Create the Matcher.

        vocab (Vocab): The vocabulary object, which must be shared with the
            documents the matcher will operate on.
        RETURNS (Matcher): The newly constructed object.
        """
        self._patterns = {}
        self._entities = {}
        self._callbacks = {}
        self.vocab = vocab
        self.mem = Pool()

    def __reduce__(self):
        # Pickle via the vocab plus the Python-level patterns and callbacks;
        # the C-level pattern structs are rebuilt by `unpickle_matcher`.
        data = (self.vocab, self._patterns, self._callbacks)
        return (unpickle_matcher, data, None, None)

    def __len__(self):
        """Get the number of rules added to the matcher. Note that this only
        returns the number of rules (identical with the number of IDs), not the
        number of individual patterns.

        RETURNS (int): The number of rules.
        """
        return len(self._patterns)

    def __contains__(self, key):
        """Check whether the matcher contains rules for a match ID.

        key (unicode): The match ID.
        RETURNS (bool): Whether the matcher contains rules for this match ID.
        """
        return self._normalize_key(key) in self._patterns

    def add(self, key, on_match, *patterns):
        """Add a match-rule to the matcher. A match-rule consists of: an ID
        key, an on_match callback, and one or more patterns.

        If the key exists, the patterns are appended to the previous ones, and
        the previous on_match callback is replaced. The `on_match` callback
        will receive the arguments `(matcher, doc, i, matches)`. You can also
        set `on_match` to `None` to not perform any actions.

        A pattern consists of one or more `token_specs`, where a `token_spec`
        is a dictionary mapping attribute IDs to values, and optionally a
        quantifier operator under the key "op". The available quantifiers are:

        '!': Negate the pattern, by requiring it to match exactly 0 times.
        '?': Make the pattern optional, by allowing it to match 0 or 1 times.
        '+': Require the pattern to match 1 or more times.
        '*': Allow the pattern to match zero or more times.

        The + and * operators are usually interpreted "greedily", i.e. longer
        matches are returned where possible. However, if you specify two '+'
        and '*' patterns in a row and their matches overlap, the first
        operator will behave non-greedily. This quirk in the semantics makes
        the matcher more efficient, by avoiding the need for back-tracking.

        key (unicode): The match ID.
        on_match (callable): Callback executed on match.
        *patterns (list): List of token descriptions.
        RAISES (ValueError): If any of the patterns is empty.
        """
        # Validate everything up front so that a bad pattern does not leave
        # the matcher half-updated.
        for pattern in patterns:
            if len(pattern) == 0:
                raise ValueError(Errors.E012.format(key=key))
        key = self._normalize_key(key)
        for pattern in patterns:
            specs = _convert_strings(pattern, self.vocab.strings)
            self.patterns.push_back(init_pattern(self.mem, key, specs))
        self._patterns.setdefault(key, [])
        self._callbacks[key] = on_match
        self._patterns[key].extend(patterns)

    def remove(self, key):
        """Remove a rule from the matcher. A KeyError is raised if the key does
        not exist.

        key (unicode): The ID of the match rule.
        """
        cdef int i = 0
        key = self._normalize_key(key)
        self._patterns.pop(key)
        self._callbacks.pop(key)
        # Erasing shifts the following entries down, so the index only
        # advances when the current entry is kept.
        while i < self.patterns.size():
            pattern_key = get_pattern_key(self.patterns.at(i))
            if pattern_key == key:
                self.patterns.erase(self.patterns.begin()+i)
            else:
                i += 1

    def has_key(self, key):
        """Check whether the matcher has a rule with a given key.

        key (string or int): The key to check.
        RETURNS (bool): Whether the matcher has the rule.
        """
        return self.__contains__(key)

    def get(self, key, default=None):
        """Retrieve the pattern stored for a key.

        key (unicode or int): The key to retrieve.
        default: Value returned if the key has no rule.
        RETURNS (tuple): The rule, as an (on_match, patterns) tuple.
        """
        key = self._normalize_key(key)
        if key not in self._patterns:
            return default
        return (self._callbacks[key], self._patterns[key])

    def pipe(self, docs, batch_size=1000, n_threads=2):
        """Match a stream of documents, yielding them in turn.

        docs (iterable): A stream of documents.
        batch_size (int): Number of documents to accumulate into a working set.
            Not used by this implementation, which processes one document
            at a time.
        n_threads (int): The number of threads with which to work on the buffer
            in parallel, if the implementation supports multi-threading.
            Not used by this implementation.
        YIELDS (Doc): Documents, in order.
        """
        for doc in docs:
            self(doc)
            yield doc

    def __call__(self, Doc doc):
        """Find all token sequences matching the supplied pattern.

        doc (Doc): The document to match over.
        RETURNS (list): A list of `(key, start, end)` tuples,
            describing the matches. A match tuple describes a span
            `doc[start:end]`. The `key` is the integer match ID.
        """
        # Indexing an empty C++ vector is undefined behaviour, and with no
        # patterns there is nothing to match.
        if self.patterns.empty():
            return []
        matches = find_matches(&self.patterns[0], self.patterns.size(), doc)
        for i, (key, start, end) in enumerate(matches):
            on_match = self._callbacks.get(key, None)
            if on_match is not None:
                on_match(self, doc, i, matches)
        return matches

    def _normalize_key(self, key):
        # String keys are converted to IDs through the vocab's string store;
        # any other key is used as given.
        if isinstance(key, basestring):
            return self.vocab.strings.add(key)
        else:
            return key
|
2016-02-03 04:04:55 +03:00
|
|
|
|
2015-10-08 18:00:45 +03:00
|
|
|
|
2018-03-27 20:23:02 +03:00
|
|
|
def unpickle_matcher(vocab, patterns, callbacks):
    """Rebuild a Matcher from its vocab, patterns and callbacks.

    vocab (Vocab): Vocabulary passed to the new Matcher.
    patterns (dict): Maps each match ID to its list of patterns.
    callbacks (dict): Maps match IDs to `on_match` callbacks; IDs without an
        entry are added with `None` as the callback.
    RETURNS (Matcher): The reconstructed matcher.
    """
    restored = Matcher(vocab)
    for match_id, match_patterns in patterns.items():
        restored.add(match_id, callbacks.get(match_id), *match_patterns)
    return restored
|
|
|
|
|
|
|
|
|
|
|
|
def _get_longest_matches(matches):
    """Filter out matches that have a longer equivalent.

    matches (iterable): `(pattern_id, start, end)` tuples.
    RETURNS (list): One `(pattern_id, start, end)` tuple for each distinct
        `(pattern_id, start)` pair, keeping the largest `end` seen for it.
    """
    longest_matches = {}
    for pattern_id, start, end in matches:
        key = (pattern_id, start)
        length = end - start
        if key not in longest_matches or length > longest_matches[key]:
            longest_matches[key] = length
    return [(pattern_id, start, start + length)
            for (pattern_id, start), length in longest_matches.items()]
|
|
|
|
|
|
|
|
|
2016-10-17 16:23:31 +03:00
|
|
|
def get_bilou(length):
    """Return the sequence of entity flag IDs for a phrase of a given length.

    Phrases of length 1, 2 and 3 use their own flag sets; any longer phrase
    uses the length-4 flags, repeating the inner flag as often as needed.

    length (int): Number of tokens in the phrase.
    RETURNS (list): `length` flag IDs, one per token position.
    RAISES (ValueError): If `length` is smaller than 1.
    """
    # Reject negative lengths as well as 0: they would otherwise fall through
    # to the general case and return a 3-element list.
    if length < 1:
        raise ValueError("Length must be >= 1")
    if length == 1:
        return [U_ENT]
    if length == 2:
        return [B2_ENT, L2_ENT]
    if length == 3:
        return [B3_ENT, I3_ENT, L3_ENT]
    return [B4_ENT] + [I4_ENT] * (length - 2) + [L4_ENT]
|
2016-10-17 16:23:31 +03:00
|
|
|
|
|
|
|
|
2015-10-08 18:00:45 +03:00
|
|
|
cdef class PhraseMatcher:
|
|
|
|
cdef Pool mem
|
|
|
|
cdef Vocab vocab
|
|
|
|
cdef Matcher matcher
|
|
|
|
cdef PreshMap phrase_ids
|
|
|
|
cdef int max_length
|
2018-11-15 05:00:58 +03:00
|
|
|
cdef attr_id_t attr
|
2017-09-20 23:20:35 +03:00
|
|
|
cdef public object _callbacks
|
2017-09-20 23:26:40 +03:00
|
|
|
cdef public object _patterns
|
2017-09-20 23:20:35 +03:00
|
|
|
|
2018-11-15 05:00:58 +03:00
|
|
|
def __init__(self, Vocab vocab, max_length=0, attr='ORTH'):
    """Create the PhraseMatcher.

    vocab (Vocab): The vocabulary object; also used to build the internal
        token Matcher.
    max_length (int): Deprecated. Any value other than 0 triggers a
        deprecation warning.
    attr (int / unicode): Attribute to match on, either as an integer ID
        or as a name that is looked up in `vocab.strings`.
    """
    if max_length != 0:
        deprecation_warning(Warnings.W010)
    self.mem = Pool()
    self.max_length = max_length
    self.vocab = vocab
    self.matcher = Matcher(self.vocab)
    self.attr = attr if isinstance(attr, long) else self.vocab.strings[attr]
    self.phrase_ids = PreshMap()
    # Candidate phrases are found by matching on the flag sequences for
    # lengths 1, 2, 3 and 4-or-more.
    single = [{U_ENT: True}]
    pair = [{B2_ENT: True}, {L2_ENT: True}]
    triple = [{B3_ENT: True}, {I3_ENT: True}, {L3_ENT: True}]
    longer = [{B4_ENT: True}, {I4_ENT: True}, {I4_ENT: True, "OP": "+"}, {L4_ENT: True}]
    self.matcher.add('Candidate', None, single, pair, triple, longer)
    self._callbacks = {}
|
2017-09-26 16:35:53 +03:00
|
|
|
|
2017-09-26 14:11:17 +03:00
|
|
|
def __len__(self):
    """Get the number of rules added to the matcher. Note that this only
    returns the number of rules (identical with the number of IDs), not the
    number of individual patterns.

    RETURNS (int): The number of rules.
    """
    # `_callbacks` has one entry per match ID, whereas `phrase_ids` has one
    # entry per phrase, so only the former gives the number of rules.
    return len(self._callbacks)
|
2017-09-26 16:35:53 +03:00
|
|
|
|
|
|
|
def __contains__(self, key):
    """Check whether the matcher contains rules for a match ID.

    key (unicode): The match ID.
    RETURNS (bool): Whether the matcher contains rules for this match ID.
    """
    # Normalise the key the same way `add` does before looking it up.
    cdef hash_t match_id = self.matcher._normalize_key(key)
    return match_id in self._callbacks
|
2015-10-08 18:00:45 +03:00
|
|
|
|
2017-09-20 23:26:40 +03:00
|
|
|
def __reduce__(self):
    # NOTE(review): only the vocab is passed back to the constructor, so the
    # added phrases, callbacks and `attr` setting do not appear to be carried
    # over when pickling -- confirm whether that is intended.
    constructor_args = (self.vocab,)
    return (self.__class__, constructor_args, None, None)
|
|
|
|
|
2017-09-20 23:20:35 +03:00
|
|
|
def add(self, key, on_match, *docs):
    """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
    key, an on_match callback, and one or more patterns.

    key (unicode): The match ID.
    on_match (callable): Callback executed on match.
    *docs (Doc): `Doc` objects representing match patterns.
    """
    cdef Doc doc
    cdef int length
    cdef int i
    cdef hash_t phrase_hash
    cdef Pool mem
    cdef hash_t ent_id = self.matcher._normalize_key(key)
    self._callbacks[ent_id] = on_match
    # Scratch pool for the per-phrase key buffers that get hashed below.
    mem = Pool()
    for doc in docs:
        length = doc.length
        if length == 0:
            # Empty docs contribute no pattern.
            continue
        tags = get_bilou(length)
        phrase_key = <attr_t*>mem.alloc(length, sizeof(attr_t))
        for i, tag in enumerate(tags):
            # Flag the lexeme for this position and record its orth value
            # as part of the phrase key.
            lexeme = self.vocab[self.get_lex_value(doc, i)]
            lexeme.set_flag(tag, True)
            phrase_key[i] = lexeme.orth
        phrase_hash = hash64(phrase_key, length * sizeof(attr_t), 0)
        self.phrase_ids.set(phrase_hash, <void*>ent_id)
|
2015-10-08 18:00:45 +03:00
|
|
|
|
|
|
|
def __call__(self, Doc doc):
    """Find all sequences matching the supplied patterns on the `Doc`.

    doc (Doc): The document to match over.
    RETURNS (list): A list of `(key, start, end)` tuples describing the
        matches. Each tuple describes the span `doc[start:end]`; `key` is
        the value returned by `accept_match` for that span.
    """
    if self.attr == ORTH:
        match_doc = doc
    else:
        # When matching on another attribute, build a stand-in Doc whose
        # words are the per-token values from `get_lex_value`, so that the
        # underlying matcher compares those values instead of the text.
        words = [self.get_lex_value(doc, idx) for idx in range(len(doc))]
        match_doc = Doc(self.vocab, words=words)
    matches = []
    for _, start, end in self.matcher(match_doc):
        ent_id = self.accept_match(match_doc, start, end)
        if ent_id is None:
            # Candidate span is not one of the registered phrases.
            continue
        matches.append((ent_id, start, end))
    # Callbacks run only after all matches are collected, and receive the
    # original doc rather than the stand-in.
    for i, match in enumerate(matches):
        on_match = self._callbacks.get(match[0])
        if on_match is not None:
            on_match(self, doc, i, matches)
    return matches
|
2015-10-08 18:00:45 +03:00
|
|
|
|
2018-07-06 13:16:44 +03:00
|
|
|
def pipe(self, stream, batch_size=1000, n_threads=1, return_matches=False,
         as_tuples=False):
    """Match a stream of documents, yielding them in turn.

    stream (iterable): A stream of documents, or of (doc, context) tuples
        if `as_tuples` is True.
    batch_size (int): Accepted but not used by this implementation.
    n_threads (int): Accepted but not used by this implementation.
    return_matches (bool): Yield the match lists along with the docs,
        making results (doc, matches) tuples.
    as_tuples (bool): Interpret the input stream as (doc, context) tuples,
        and yield (result, context) tuples out. If both `return_matches`
        and `as_tuples` are True, the output will be a sequence of
        ((doc, matches), context) tuples.
    YIELDS (Doc): Documents, in order.
    """
    for item in stream:
        if as_tuples:
            doc, context = item
        else:
            doc = item
        # The matcher is always run, even when the matches are not yielded.
        matches = self(doc)
        result = (doc, matches) if return_matches else doc
        if as_tuples:
            yield (result, context)
        else:
            yield result
|
2016-02-03 04:04:55 +03:00
|
|
|
|
2017-09-20 23:20:35 +03:00
|
|
|
def accept_match(self, Doc doc, int start, int end):
    """Look up the match ID registered for the span `doc[start:end]`.

    The lexeme IDs of the span are hashed the same way `add` hashes a
    pattern, and the hash is looked up in `self.phrase_ids`.

    doc (Doc): The document containing the candidate span.
    start (int): Index of the first token of the span.
    end (int): Index one past the last token of the span.
    RETURNS: The stored match ID, or None if the lookup yields 0.
    """
    cdef int i
    cdef hash_t key
    cdef int n_tokens = end - start
    cdef Pool mem = Pool()
    phrase_key = <attr_t*>mem.alloc(n_tokens, sizeof(attr_t))
    for i in range(n_tokens):
        phrase_key[i] = doc.c[start + i].lex.orth
    key = hash64(phrase_key, n_tokens * sizeof(attr_t), 0)
    ent_id = <hash_t>self.phrase_ids.get(key)
    if ent_id != 0:
        return ent_id
    return None
|
2018-09-05 06:53:21 +03:00
|
|
|
|
2018-11-15 05:00:58 +03:00
|
|
|
def get_lex_value(self, Doc doc, int i):
    """Return the value that represents token `i` for matching purposes.

    doc (Doc): The document containing the token.
    i (int): Index of the token within `doc`.
    RETURNS: The lexeme's `orth` ID when matching on ORTH; otherwise a
        string of the form 'matcher:<attribute name>-<attribute value>'.
    """
    if self.attr == ORTH:
        # Plain text matching: use the lexeme's own ID.
        return doc.c[i].lex.orth
    # Any other attribute: read its value off the token, e.g. token.pos.
    attr_value = get_token_attr(&doc.c[i], self.attr)
    if attr_value not in (0, 1):
        value_text = self.vocab.strings[attr_value]
    else:
        # 0 and 1 are treated as boolean values and rendered as digits.
        value_text = str(attr_value)
    name_text = self.vocab.strings[self.attr]
    # Prefix with the attribute name so the generated entry (e.g. for
    # POS and VERB) cannot collide with an ordinary word such as 'VERB'
    # and produce false positive matches.
    return 'matcher:{}-{}'.format(name_text, value_text)
|
|
|
|
|
2018-09-05 06:53:21 +03:00
|
|
|
|
|
|
|
cdef class DependencyTreeMatcher:
    """Match dependency parse tree based on pattern rules."""
    cdef Pool mem
    cdef readonly Vocab vocab
    # Single token-level matcher shared by every node of every pattern.
    cdef readonly Matcher token_matcher
    # match ID -> list (one entry per added pattern) of per-node token
    # patterns, each wrapped in a one-element list.
    cdef public object _patterns
    # match ID -> list of dicts mapping token-matcher key -> node index.
    cdef public object _keys_to_token
    # match ID -> list of root node indices (-1 if no root was found).
    cdef public object _root
    # Initialised to an empty dict; not used by the methods of this class.
    cdef public object _entities
    # match ID -> on_match callback (the latest `add` call wins).
    cdef public object _callbacks
    # match ID -> list of dicts mapping NODE_NAME -> node index.
    cdef public object _nodes
    # match ID -> list of dicts mapping head node index -> list of
    # (operator, child node index) tuples.
    cdef public object _tree

    def __init__(self, vocab):
        """Create the DependencyTreeMatcher.

        vocab (Vocab): The vocabulary object, which must be shared with the
            documents the matcher will operate on.
        RETURNS (DependencyTreeMatcher): The newly constructed object.
        """
        self.token_matcher = Matcher(vocab)
        self._keys_to_token = {}
        self._patterns = {}
        self._root = {}
        self._nodes = {}
        self._tree = {}
        self._entities = {}
        self._callbacks = {}
        self.vocab = vocab
        self.mem = Pool()

    def __reduce__(self):
        """Support pickling.

        NOTE(review): `unpickle_matcher` is defined outside this class;
        confirm that it accepts these four arguments.

        RETURNS (tuple): A `(callable, args, state, listitems)` tuple.
        """
        data = (self.vocab, self._patterns, self._tree, self._callbacks)
        return (unpickle_matcher, data, None, None)

    def __len__(self):
        """Get the number of match IDs that have patterns registered.

        RETURNS (int): The number of match IDs.
        """
        return len(self._patterns)

    def __contains__(self, key):
        """Check whether the matcher contains rules for a match ID.

        key (unicode): The match ID.
        RETURNS (bool): Whether the matcher contains rules for this match ID.
        """
        return self._normalize_key(key) in self._patterns

    def validateInput(self, pattern, key):
        """Check that a pattern is well-formed, raising otherwise.

        The first entry must declare only a NODE_NAME (the root). Every
        later entry must declare NODE_NAME, NBOR_RELOP and NBOR_NAME, must
        introduce a new NODE_NAME, and must refer to an NBOR_NAME that was
        declared by an earlier entry.

        pattern (list): Dicts, each with a 'PATTERN' and a 'SPEC' entry.
        key (unicode or int): The match ID, used in error messages.
        RAISES (ValueError): If any of the above conditions is violated.
        """
        visited_nodes = set()
        for idx, relation in enumerate(pattern):
            if 'PATTERN' not in relation or 'SPEC' not in relation:
                raise ValueError(Errors.E098.format(key=key))
            spec = relation['SPEC']
            has_name = 'NODE_NAME' in spec
            has_relop = 'NBOR_RELOP' in spec
            has_nbor = 'NBOR_NAME' in spec
            if idx == 0:
                if not (has_name and not has_relop and not has_nbor):
                    raise ValueError(Errors.E099.format(key=key))
                visited_nodes.add(spec['NODE_NAME'])
            else:
                if not (has_name and has_relop and has_nbor):
                    raise ValueError(Errors.E100.format(key=key))
                if (spec['NODE_NAME'] in visited_nodes
                        or spec['NBOR_NAME'] not in visited_nodes):
                    raise ValueError(Errors.E101.format(key=key))
                visited_nodes.add(spec['NODE_NAME'])

    def add(self, key, on_match, *patterns):
        """Add dependency patterns under a match ID.

        key (unicode or int): The match ID.
        on_match (callable): Callback executed on match; replaces any
            callback previously stored for this match ID.
        *patterns (list): Patterns, each a list of dicts with 'PATTERN'
            and 'SPEC' entries (see `validateInput`).
        RAISES (ValueError): If a pattern is empty or malformed.
        """
        for pattern in patterns:
            if len(pattern) == 0:
                raise ValueError(Errors.E012.format(key=key))
            self.validateInput(pattern, key)

        key = self._normalize_key(key)

        # One single-token pattern per node, for every pattern added now.
        new_patterns = [
            [[relation['PATTERN']] for relation in pattern]
            for pattern in patterns
        ]
        stored_patterns = self._patterns.setdefault(key, [])
        # Index of the first new pattern among all patterns of this key.
        # Using it in the token-matcher keys below keeps them unique when
        # `add` is called more than once for the same match ID; otherwise
        # nodes of different patterns would share a key and be confused.
        offset = len(stored_patterns)
        self._callbacks[key] = on_match
        stored_patterns.extend(new_patterns)

        # Add each node pattern of all the input patterns individually to
        # the matcher. This enables only a single instance of Matcher to be
        # used. Multiple adds are required to track each node pattern.
        keys_to_token_list = []
        for i, token_patterns in enumerate(new_patterns):
            keys_to_token = {}
            # TODO : Better ways to hash edges in pattern?
            for j, token_pattern in enumerate(token_patterns):
                k = self._normalize_key(
                    unicode(key) + DELIMITER + unicode(offset + i)
                    + DELIMITER + unicode(j)
                )
                self.token_matcher.add(k, None, token_pattern)
                keys_to_token[k] = j
            keys_to_token_list.append(keys_to_token)
        self._keys_to_token.setdefault(key, []).extend(keys_to_token_list)

        nodes_list = []
        for pattern in patterns:
            nodes = {}
            for i, relation in enumerate(pattern):
                nodes[relation['SPEC']['NODE_NAME']] = i
            nodes_list.append(nodes)
        self._nodes.setdefault(key, []).extend(nodes_list)

        # Create an object tree to traverse later on. This data structure
        # enables easy tree pattern matching. The Doc/Token based tree
        # cannot be reused since it is memory heavy and tightly coupled
        # with the doc.
        self.retrieve_tree(patterns, nodes_list, key)

    def retrieve_tree(self, patterns, _nodes_list, key):
        """Build and store the head -> children tree for each pattern.

        patterns (iterable): The patterns passed to `add`.
        _nodes_list (list): For each pattern, a dict NODE_NAME -> index.
        key (int): The normalized match ID to store the results under.
        """
        heads_list = []
        root_list = []
        for i, pattern in enumerate(patterns):
            heads = {}
            root = -1
            for j, token_pattern in enumerate(pattern):
                spec = token_pattern['SPEC']
                if 'NBOR_RELOP' not in spec:
                    # A node without an operator is the root; it is
                    # recorded as its own head.
                    heads[j] = ('root', j)
                    root = j
                else:
                    heads[j] = (spec['NBOR_RELOP'],
                                _nodes_list[i][spec['NBOR_NAME']])
            heads_list.append(heads)
            root_list.append(root)

        tree_list = []
        for heads in heads_list:
            tree = {}
            for j in range(len(heads)):
                head = heads[j][INDEX_HEAD]
                if head == j:
                    # The root has no incoming edge.
                    continue
                tree.setdefault(head, []).append((heads[j][INDEX_RELOP], j))
            tree_list.append(tree)

        self._tree.setdefault(key, []).extend(tree_list)
        self._root.setdefault(key, []).extend(root_list)

    def has_key(self, key):
        """Check whether the matcher has a rule with a given key.

        key (string or int): The key to check.
        RETURNS (bool): Whether the matcher has the rule.
        """
        return self._normalize_key(key) in self._patterns

    def get(self, key, default=None):
        """Retrieve the pattern stored for a key.

        key (unicode or int): The key to retrieve.
        default: Value returned if the key has no patterns.
        RETURNS (tuple): The rule, as an (on_match, patterns) tuple.
        """
        key = self._normalize_key(key)
        if key not in self._patterns:
            return default
        return (self._callbacks[key], self._patterns[key])

    def __call__(self, Doc doc):
        """Find all subtrees of `doc` matching the registered patterns.

        doc (Doc): The document to match over.
        RETURNS (list): A list of `(key, token_indices)` tuples, where
            `token_indices` is the list of token indices visited for that
            match. Each callback is invoked as
            `on_match(matcher, doc, i, matched_trees)`, with `i` the index
            of its match in the returned list.
        """
        matched_trees = []
        matches = self.token_matcher(doc)
        for key in list(self._patterns.keys()):
            keys_to_token_list = self._keys_to_token[key]
            root_list = self._root[key]
            tree_list = self._tree[key]
            nodes_list = self._nodes[key]
            for pattern_idx in range(len(self._patterns[key])):
                keys_to_token = keys_to_token_list[pattern_idx]
                root = root_list[pattern_idx]
                tree = tree_list[pattern_idx]
                nodes = nodes_list[pattern_idx]
                n_nodes = len(nodes)

                # Pattern node index -> token positions that satisfy that
                # node's token pattern.
                id_to_position = {}
                for node_idx in range(n_nodes):
                    id_to_position[node_idx] = []
                # This could be taken outside to improve running time..?
                for match_id, start, end in matches:
                    if match_id in keys_to_token:
                        id_to_position[keys_to_token[match_id]].append(start)

                node_operator_map = self.get_node_operator_map(
                    doc, tree, id_to_position, nodes, root
                )
                if root in id_to_position:
                    for candidate in id_to_position[root]:
                        is_visited = {}
                        self.dfs(candidate, root, tree, id_to_position, doc,
                                 is_visited, node_operator_map)
                        # To check if the subtree pattern is completely
                        # identified. This is a heuristic, done to reduce
                        # the complexity of exponential unordered subtree
                        # matching. Will give approximate matches in some
                        # cases.
                        if len(is_visited) == n_nodes:
                            matched_trees.append((key, list(is_visited)))

        for i, match in enumerate(matched_trees):
            on_match = self._callbacks.get(match[0])
            if on_match is not None:
                # `i` indexes `matched_trees`, so that is the list handed
                # to the callback (not the raw token-matcher matches).
                on_match(self, doc, i, matched_trees)

        return matched_trees

    def dfs(self, candidate, root, tree, id_to_position, doc, isVisited,
            _node_operator_map):
        """Depth-first walk marking the tokens that fit the pattern tree.

        candidate (int): Token index proposed for pattern node `root`.
        root (int): Index of the current pattern node.
        tree (dict): Head node index -> list of (operator, child index).
        id_to_position (dict): Node index -> matching token positions.
        doc (Doc): The document being matched.
        isVisited (dict): Updated in place; accepted token indices are
            added as keys.
        _node_operator_map (dict): Result of `get_node_operator_map`.
        """
        if root in id_to_position and candidate in id_to_position[root]:
            # color the node since it is valid
            isVisited[candidate] = True
            if root in tree:
                for root_child in tree[root]:
                    relop = root_child[INDEX_RELOP]
                    if (candidate in _node_operator_map
                            and relop in _node_operator_map[candidate]):
                        for candidate_child in _node_operator_map[candidate][relop]:
                            self.dfs(
                                candidate_child.i,
                                root_child[INDEX_HEAD],
                                tree,
                                id_to_position,
                                doc,
                                isVisited,
                                _node_operator_map
                            )

    def get_node_operator_map(self, doc, tree, id_to_position, nodes, root):
        """Precompute, per candidate token and operator, the related tokens.

        Given a node and an edge operator, the result holds the list of
        tokens from the doc that belong to node+operator. Everything is
        computed beforehand to prevent unnecessary computation while
        pattern matching.

        doc (Doc): The document being matched.
        tree (dict): Head node index -> list of (operator, child index).
        id_to_position (dict): Node index -> matching token positions.
        nodes (dict): NODE_NAME -> node index.
        root (int): Root node index (not used by this method).
        RETURNS (dict): `result[token_index][operator] = [tokens...]`.
        """
        all_node_indices = list(nodes.values())

        all_operators = set()
        for node in all_node_indices:
            if node in tree:
                for child in tree[node]:
                    all_operators.add(child[INDEX_RELOP])

        all_nodes = set()
        for node in all_node_indices:
            all_nodes.update(id_to_position[node])

        # Used to invoke methods for each operator
        switcher = {
            '<': self.dep,
            '>': self.gov,
            '>>': self.dep_chain,
            '<<': self.gov_chain,
            '.': self.imm_precede,
            '$+': self.imm_right_sib,
            '$-': self.imm_left_sib,
            '$++': self.right_sib,
            '$--': self.left_sib
        }

        _node_operator_map = {}
        for node in all_nodes:
            _node_operator_map[node] = {}
            for operator in all_operators:
                # An operator missing from `switcher` yields None here and
                # fails with a TypeError when called.
                _node_operator_map[node][operator] = switcher.get(operator)(doc, node)
        return _node_operator_map

    def dep(self, doc, node):
        """Return the head of token `node`, as a one-element list."""
        # A Token is not iterable, so wrap it instead of calling list().
        return [doc[node].head]

    def gov(self, doc, node):
        """Return the children of token `node`."""
        return list(doc[node].children)

    def dep_chain(self, doc, node):
        """Return the ancestors of token `node`."""
        return list(doc[node].ancestors)

    def gov_chain(self, doc, node):
        """Return the subtree of token `node`."""
        return list(doc[node].subtree)

    def imm_precede(self, doc, node):
        """Return the token at index `node - 1`, or [] for the first token."""
        if node > 0:
            return [doc[node - 1]]
        return []

    # NOTE(review): the four sibling helpers below keep the index
    # comparisons of the original code (e.g. `$+` selects the sibling at
    # `node - 1`). Confirm the intended direction of these operators.

    def imm_right_sib(self, doc, node):
        """Return the sibling of `node` located at index `node - 1`, if any."""
        for child in doc[node].head.children:
            if child.i == node - 1:
                return [child]
        return []

    def imm_left_sib(self, doc, node):
        """Return the sibling of `node` located at index `node + 1`, if any."""
        for child in doc[node].head.children:
            if child.i == node + 1:
                return [child]
        return []

    def right_sib(self, doc, node):
        """Return the siblings of `node` whose index is below `node`."""
        return [child for child in doc[node].head.children if child.i < node]

    def left_sib(self, doc, node):
        """Return the siblings of `node` whose index is above `node`."""
        return [child for child in doc[node].head.children if child.i > node]

    def _normalize_key(self, key):
        """Convert a string key to its ID in the vocab's string store.

        key (unicode or int): The key to normalize.
        RETURNS: The string-store ID for string keys; any other key is
            returned unchanged.
        """
        if isinstance(key, basestring):
            return self.vocab.strings.add(key)
        return key
|