Merge branch 'master' into spacy.io
commit 073e8d647c
(mirror of https://github.com/explosion/spaCy.git)

@@ -8,8 +8,6 @@ from spacy.kb import KnowledgeBase
 import csv
 import datetime
 
-from spacy import Errors
-
 
 def create_kb(
     nlp,

@@ -33,7 +31,10 @@ def create_kb(
         input_dim = nlp.vocab.vectors_length
         print("Loaded pre-trained vectors of size %s" % input_dim)
     else:
-        raise ValueError(Errors.E155)
+        raise ValueError(
+            "The `nlp` object should have access to pre-trained word vectors, "
+            " cf. https://spacy.io/usage/models#languages."
+        )
 
     # disable this part of the pipeline when rerunning the KB generation from preprocessed files
     if read_raw_data:

@@ -73,7 +73,10 @@ def main(
 
     # check the length of the nlp vectors
     if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
-        raise ValueError(Errors.E155)
+        raise ValueError(
+            "The `nlp` object should have access to pre-trained word vectors, "
+            " cf. https://spacy.io/usage/models#languages."
+        )
 
     # STEP 2: create prior probabilities from WP
     print()

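The two hunks above inline what used to be the shared Errors.E155 constant into the KB-creation script. A minimal sketch of the guard being enforced, assuming a vectors-enabled package such as en_core_web_md (the model name is illustrative):

```python
import spacy

nlp = spacy.load("en_core_web_md")  # any model that ships word vectors
# Same check the script performs before building the knowledge base:
if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
    raise ValueError(
        "The `nlp` object should have access to pre-trained word vectors, "
        " cf. https://spacy.io/usage/models#languages."
    )
print("Loaded pre-trained vectors of size %s" % nlp.vocab.vectors_length)
```
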
@@ -19,8 +19,6 @@ from bin.wiki_entity_linking import training_set_creator
 
 import spacy
 from spacy.kb import KnowledgeBase
-
-from spacy import Errors
 from spacy.util import minibatch, compounding
 
 

@@ -68,7 +66,7 @@ def main(
 
     # check that there is a NER component in the pipeline
     if "ner" not in nlp.pipe_names:
-        raise ValueError(Errors.E152)
+        raise ValueError("The `nlp` object should have a pre-trained `ner` component.")
 
     # STEP 2 : read the KB
     print()

@@ -82,7 +80,10 @@ def main(
         print(now(), "STEP 3: reading training dataset from", loc_training)
     else:
         if not wp_xml:
-            raise ValueError(Errors.E153)
+            raise ValueError(
+                "Either provide a path to a preprocessed training directory, "
+                "or to the original Wikipedia XML dump."
+            )
 
         if output_dir:
             loc_training = output_dir / "training_data"

@@ -17,12 +17,10 @@ import plac
 from pathlib import Path
 
 from spacy.vocab import Vocab
-
 import spacy
 from spacy.kb import KnowledgeBase
 
 from bin.wiki_entity_linking.train_descriptions import EntityEncoder
-from spacy import Errors
 
 
 # Q2146908 (Russ Cochran): American golfer

@@ -45,7 +43,7 @@ def main(vocab_path=None, model=None, output_dir=None, n_iter=50):
     If an output_dir is provided, the KB will be stored there in a file 'kb'.
     When providing an nlp model, the updated vocab will also be written to a directory in the output_dir."""
     if model is None and vocab_path is None:
-        raise ValueError(Errors.E154)
+        raise ValueError("Either the `nlp` model or the `vocab` should be specified.")
 
     if model is not None:
         nlp = spacy.load(model)  # load existing spaCy model

@@ -22,8 +22,6 @@ from spacy.vocab import Vocab
 
 import spacy
 from spacy.kb import KnowledgeBase
-
-from spacy import Errors
 from spacy.tokens import Span
 from spacy.util import minibatch, compounding
 

@@ -128,7 +128,7 @@ class DependencyRenderer(object):
         """
         if start < 0 or end < 0:
             error_args = dict(start=start, end=end, label=label, dir=direction)
-            raise ValueError(Errors.E156.format(**error_args))
+            raise ValueError(Errors.E157.format(**error_args))
         level = self.levels.index(end - start) + 1
         x_start = self.offset_x + start * self.distance + self.arrow_spacing
         if self.direction == "rtl":

@@ -431,13 +431,24 @@ class Errors(object):
             "same, but found '{nlp}' and '{vocab}' respectively.")
     E151 = ("Trying to call nlp.update without required annotation types. "
             "Expected top-level keys: {exp}. Got: {unexp}.")
-    E152 = ("The `nlp` object should have a pre-trained `ner` component.")
-    E153 = ("Either provide a path to a preprocessed training directory, "
-            "or to the original Wikipedia XML dump.")
-    E154 = ("Either the `nlp` model or the `vocab` should be specified.")
-    E155 = ("The `nlp` object should have access to pre-trained word vectors, "
-            " cf. https://spacy.io/usage/models#languages.")
-    E156 = ("Can't render negative values for dependency arc start or end. "
+    E152 = ("The attribute {attr} is not supported for token patterns. "
+            "Please use the option validate=True with Matcher, PhraseMatcher, "
+            "or EntityRuler for more details.")
+    E153 = ("The value type {vtype} is not supported for token patterns. "
+            "Please use the option validate=True with Matcher, PhraseMatcher, "
+            "or EntityRuler for more details.")
+    E154 = ("One of the attributes or values is not supported for token "
+            "patterns. Please use the option validate=True with Matcher, "
+            "PhraseMatcher, or EntityRuler for more details.")
+    E155 = ("The pipeline needs to include a tagger in order to use "
+            "Matcher or PhraseMatcher with the attributes POS, TAG, or LEMMA. "
+            "Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) "
+            "instead of list(nlp.tokenizer.pipe()).")
+    E156 = ("The pipeline needs to include a parser in order to use "
+            "Matcher or PhraseMatcher with the attribute DEP. Try using "
+            "nlp() instead of nlp.make_doc() or list(nlp.pipe()) instead of "
+            "list(nlp.tokenizer.pipe()).")
+    E157 = ("Can't render negative values for dependency arc start or end. "
             "Make sure that you're passing in absolute token indices, not "
             "relative token offsets.\nstart: {start}, end: {end}, label: "
             "{label}, direction: {dir}")

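The hunk above repurposes E152–E156 for the new Matcher/PhraseMatcher attribute and pipeline checks and shifts the displaCy rendering error to E157; the old messages move inline into the bin/wiki_entity_linking scripts. A small sketch of how the new templates get filled in (the attr/vtype values are illustrative):

```python
from spacy.errors import Errors

# Codes are plain format strings; call sites supply the offending values:
print(Errors.E152.format(attr="XX"))    # unsupported pattern attribute
print(Errors.E153.format(vtype="int"))  # unsupported pattern value type
```
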
@@ -12,7 +12,7 @@ from ...util import update_exc, add_lookups
 
 class SerbianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "rs"
+    lex_attr_getters[LANG] = lambda text: "sr"
     lex_attr_getters[NORM] = add_lookups(
         Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
     )

@@ -21,7 +21,7 @@ class SerbianDefaults(Language.Defaults):
 
 
 class Serbian(Language):
-    lang = "rs"
+    lang = "sr"
     Defaults = SerbianDefaults
 
 

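Together with the previous hunk, this corrects the Serbian language code from "rs" to "sr", the ISO 639-1 code for Serbian. A quick way to see the effect, sketched against the spacy.blank API:

```python
import spacy

# Serbian is now registered under its ISO 639-1 code:
nlp = spacy.blank("sr")
doc = nlp("Ovo je rečenica.")
print(nlp.lang, [token.text for token in doc])
```
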
@@ -102,6 +102,10 @@ TOKEN_PATTERN_SCHEMA = {
                 "title": "Entity label of single token",
                 "$ref": "#/definitions/string_value",
             },
+            "NORM": {
+                "title": "Normalized form of the token text",
+                "$ref": "#/definitions/string_value",
+            },
             "LENGTH": {
                 "title": "Token character length",
                 "$ref": "#/definitions/integer_value",

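With NORM added to TOKEN_PATTERN_SCHEMA, token patterns using it now pass schema validation. A minimal sketch, assuming the default English norm data lowercases a plain token like "Hello":

```python
from spacy.lang.en import English
from spacy.matcher import Matcher

nlp = English()
matcher = Matcher(nlp.vocab, validate=True)
# NORM is now a schema-approved attribute, so validation accepts it:
matcher.add("HELLO", None, [{"NORM": "hello"}])
doc = nlp("Hello world!")
print([doc[start:end].text for _, start, end in matcher(doc)])
```
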
@@ -67,3 +67,4 @@ cdef class Matcher:
     cdef public object _callbacks
     cdef public object _extensions
     cdef public object _extra_predicates
+    cdef public object _seen_attrs

@@ -15,7 +15,7 @@ from ..structs cimport TokenC
 from ..vocab cimport Vocab
 from ..tokens.doc cimport Doc, get_token_attr
 from ..tokens.token cimport Token
-from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH
+from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA
 
 from ._schemas import TOKEN_PATTERN_SCHEMA
 from ..util import get_json_validator, validate_json

@@ -45,7 +45,7 @@ cdef class Matcher:
         self._patterns = {}
         self._callbacks = {}
         self._extensions = {}
-        self._extra_predicates = []
+        self._seen_attrs = set()
         self.vocab = vocab
         self.mem = Pool()
         if validate:

@@ -112,9 +112,15 @@ cdef class Matcher:
             raise MatchPatternError(key, errors)
         key = self._normalize_key(key)
         for pattern in patterns:
+            try:
                 specs = _preprocess_pattern(pattern, self.vocab.strings,
                     self._extensions, self._extra_predicates)
                 self.patterns.push_back(init_pattern(self.mem, key, specs))
+                for spec in specs:
+                    for attr, _ in spec[1]:
+                        self._seen_attrs.add(attr)
+            except OverflowError, AttributeError:
+                raise ValueError(Errors.E154.format())
         self._patterns.setdefault(key, [])
         self._callbacks[key] = on_match
         self._patterns[key].extend(patterns)

@@ -177,6 +183,11 @@ cdef class Matcher:
             describing the matches. A match tuple describes a span
             `doc[start:end]`. The `label_id` and `key` are both integers.
         """
+        if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \
+          and not doc.is_tagged:
+            raise ValueError(Errors.E155.format())
+        if DEP in self._seen_attrs and not doc.is_parsed:
+            raise ValueError(Errors.E156.format())
         matches = find_matches(&self.patterns[0], self.patterns.size(), doc,
                                extensions=self._extensions,
                                predicates=self._extra_predicates)

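This is the user-visible half of the _seen_attrs bookkeeping: calling a Matcher whose patterns use LEMMA, POS or TAG on an untagged Doc now fails fast with E155 (and DEP on an unparsed Doc with E156). A sketch of both paths, assuming a tagged pipeline such as en_core_web_sm (the model name is illustrative):

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
matcher.add("VERBS", None, [{"POS": "VERB"}])

doc = nlp("She sells seashells.")  # full pipeline: POS is set
print(matcher(doc))

untagged = nlp.make_doc("She sells seashells.")  # tokenizer only
try:
    matcher(untagged)
except ValueError as err:
    print(err)  # E155: the pipeline needs to include a tagger
```
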
@@ -568,6 +579,8 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predicates):
             # Signifier for 'any token'
             tokens.append((ONE, [(NULL_ATTR, 0)], [], []))
             continue
+        if not isinstance(spec, dict):
+            raise ValueError(Errors.E154.format())
         ops = _get_operators(spec)
         attr_values = _get_attr_values(spec, string_store)
         extensions = _get_extensions(spec, string_store, extensions_table)

@@ -581,21 +594,29 @@ def _get_attr_values(spec, string_store):
     attr_values = []
     for attr, value in spec.items():
         if isinstance(attr, basestring):
+            attr = attr.upper()
             if attr == '_':
                 continue
-            elif attr.upper() == "OP":
+            elif attr == "OP":
                 continue
-            if attr.upper() == "TEXT":
+            if attr == "TEXT":
                 attr = "ORTH"
-            attr = IDS.get(attr.upper())
+            if attr not in TOKEN_PATTERN_SCHEMA["items"]["properties"]:
+                raise ValueError(Errors.E152.format(attr=attr))
+            attr = IDS.get(attr)
         if isinstance(value, basestring):
             value = string_store.add(value)
         elif isinstance(value, bool):
             value = int(value)
         elif isinstance(value, dict):
             continue
+        else:
+            raise ValueError(Errors.E153.format(vtype=type(value).__name__))
         if attr is not None:
             attr_values.append((attr, value))
+        else:
+            # should be caught above using TOKEN_PATTERN_SCHEMA
+            raise ValueError(Errors.E152.format(attr=attr))
     return attr_values
 
 

@@ -755,11 +776,13 @@ def _get_operators(spec):
         return lookup[spec["OP"]]
     else:
         keys = ", ".join(lookup.keys())
-        raise KeyError(Errors.E011.format(op=spec["OP"], opts=keys))
+        raise ValueError(Errors.E011.format(op=spec["OP"], opts=keys))
 
 
 def _get_extensions(spec, string_store, name2index):
     attr_values = []
+    if not isinstance(spec.get("_", {}), dict):
+        raise ValueError(Errors.E154.format())
     for name, value in spec.get("_", {}).items():
         if isinstance(value, dict):
             # Handle predicates (e.g. "IN", in the extra_predicates, not here.

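The added isinstance and schema checks give the Matcher a minimal validation layer even when validate=False. A sketch contrasting the two modes (MatchPatternError is a ValueError subclass, per spacy.errors):

```python
from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.errors import MatchPatternError

nlp = English()

# validate=True runs the full JSON schema and reports every problem:
strict = Matcher(nlp.vocab, validate=True)
try:
    strict.add("BAD", None, [{"XX": "foo"}])
except MatchPatternError as err:
    print(err)

# Without validation, the new minimal checks still reject unsupported
# attributes and value types via E152/E153/E154:
loose = Matcher(nlp.vocab)
try:
    loose.add("BAD", None, [{"XX": "foo"}])
except ValueError as err:
    print(err)
```
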
@@ -12,6 +12,7 @@ from ..vocab cimport Vocab
 from ..tokens.doc cimport Doc, get_token_attr
 from ..typedefs cimport attr_t, hash_t
 
+from ._schemas import TOKEN_PATTERN_SCHEMA
 from ..errors import Errors, Warnings, deprecation_warning, user_warning
 from ..attrs import FLAG61 as U_ENT
 from ..attrs import FLAG60 as B2_ENT

@@ -62,6 +63,11 @@ cdef class PhraseMatcher:
         if isinstance(attr, long):
             self.attr = attr
         else:
+            attr = attr.upper()
+            if attr == "TEXT":
+                attr = "ORTH"
+            if attr not in TOKEN_PATTERN_SCHEMA["items"]["properties"]:
+                raise ValueError(Errors.E152.format(attr=attr))
             self.attr = self.vocab.strings[attr]
         self.phrase_ids = PreshMap()
         abstract_patterns = [

@@ -123,6 +129,10 @@ cdef class PhraseMatcher:
             length = doc.length
             if length == 0:
                 continue
+            if self.attr in (POS, TAG, LEMMA) and not doc.is_tagged:
+                raise ValueError(Errors.E155.format())
+            if self.attr == DEP and not doc.is_parsed:
+                raise ValueError(Errors.E156.format())
             if self._validate and (doc.is_tagged or doc.is_parsed) \
               and self.attr not in (DEP, POS, TAG, LEMMA):
                 string_attr = self.vocab.strings[self.attr]

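The PhraseMatcher gets the same guard at add() time: pattern Docs must actually carry the attribute being matched on. A sketch assuming a pipeline with a tagger, such as en_core_web_sm (model name and phrases are illustrative):

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab, attr="LEMMA")

# Pattern docs need the full pipeline so LEMMA is actually set:
matcher.add("BE_LIKE", None, nlp("was like"))

try:
    matcher.add("BAD", None, nlp.make_doc("was like"))  # tokenizer only
except ValueError as err:
    print(err)  # E155: the pipeline needs to include a tagger
```
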
@@ -54,6 +54,8 @@ class EntityRuler(object):
         self.phrase_patterns = defaultdict(list)
         self.matcher = Matcher(nlp.vocab, validate=validate)
         if phrase_matcher_attr is not None:
+            if phrase_matcher_attr.upper() == "TEXT":
+                phrase_matcher_attr = "ORTH"
             self.phrase_matcher_attr = phrase_matcher_attr
             self.phrase_matcher = PhraseMatcher(
                 nlp.vocab, attr=self.phrase_matcher_attr, validate=validate

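A short sketch of the normalization this hunk adds: "TEXT" passed as phrase_matcher_attr is rewritten to "ORTH" before the PhraseMatcher is created:

```python
from spacy.lang.en import English
from spacy.pipeline import EntityRuler

nlp = English()
ruler = EntityRuler(nlp, phrase_matcher_attr="TEXT")  # stored as "ORTH"
ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
nlp.add_pipe(ruler)
print([(ent.text, ent.label_) for ent in nlp("Apple is a company.").ents])
```
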
@@ -10,8 +10,8 @@ from spacy.util import get_lang_class
 # excluded: ja, ru, th, uk, vi, zh
 LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
              "et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is",
-             "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "rs", "si",
-             "sk", "sl", "sq", "sv", "ta", "te", "tl", "tr", "tt", "ur"]
+             "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk",
+             "sl", "sq", "sr", "sv", "ta", "te", "tl", "tr", "tt", "ur"]
 # fmt: on
 
 

@@ -344,3 +344,39 @@ def test_dependency_matcher_compile(dependency_matcher):
 #     assert matches[0][1] == [[3, 1, 2]]
 #     assert matches[1][1] == [[4, 3, 3]]
 #     assert matches[2][1] == [[4, 3, 2]]
+
+
+def test_attr_pipeline_checks(en_vocab):
+    doc1 = Doc(en_vocab, words=["Test"])
+    doc1.is_parsed = True
+    doc2 = Doc(en_vocab, words=["Test"])
+    doc2.is_tagged = True
+    doc3 = Doc(en_vocab, words=["Test"])
+    # DEP requires is_parsed
+    matcher = Matcher(en_vocab)
+    matcher.add("TEST", None, [{"DEP": "a"}])
+    matcher(doc1)
+    with pytest.raises(ValueError):
+        matcher(doc2)
+    with pytest.raises(ValueError):
+        matcher(doc3)
+    # TAG, POS, LEMMA require is_tagged
+    for attr in ("TAG", "POS", "LEMMA"):
+        matcher = Matcher(en_vocab)
+        matcher.add("TEST", None, [{attr: "a"}])
+        matcher(doc2)
+        with pytest.raises(ValueError):
+            matcher(doc1)
+        with pytest.raises(ValueError):
+            matcher(doc3)
+    # TEXT/ORTH only require tokens
+    matcher = Matcher(en_vocab)
+    matcher.add("TEST", None, [{"ORTH": "a"}])
+    matcher(doc1)
+    matcher(doc2)
+    matcher(doc3)
+    matcher = Matcher(en_vocab)
+    matcher.add("TEST", None, [{"TEXT": "a"}])
+    matcher(doc1)
+    matcher(doc2)
+    matcher(doc3)

@@ -7,6 +7,36 @@ from spacy.matcher._schemas import TOKEN_PATTERN_SCHEMA
 from spacy.errors import MatchPatternError
 from spacy.util import get_json_validator, validate_json
 
+# (pattern, num errors with validation, num errors identified with minimal
+#  checks)
+TEST_PATTERNS = [
+    # Bad patterns flagged in all cases
+    ([{"XX": "foo"}], 1, 1),
+    ([{"LENGTH": "2", "TEXT": 2}, {"LOWER": "test"}], 2, 1),
+    ([{"IS_ALPHA": {"==": True}}, {"LIKE_NUM": None}], 2, 1),
+    ([{"IS_PUNCT": True, "OP": "$"}], 1, 1),
+    ([{"IS_DIGIT": -1}], 1, 1),
+    ([{"ORTH": -1}], 1, 1),
+    ([{"_": "foo"}], 1, 1),
+    ('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),
+    ([1, 2, 3], 3, 1),
+    # Bad patterns flagged outside of Matcher
+    ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 1, 0),
+    # Bad patterns not flagged with minimal checks
+    ([{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], 2, 0),
+    ([{"LENGTH": {"VALUE": 5}}], 1, 0),
+    ([{"TEXT": {"VALUE": "foo"}}], 1, 0),
+    # Good patterns
+    ([{"TEXT": "foo"}, {"LOWER": "bar"}], 0, 0),
+    ([{"LEMMA": {"IN": ["love", "like"]}}, {"POS": "DET", "OP": "?"}], 0, 0),
+    ([{"LIKE_NUM": True, "LENGTH": {">=": 5}}], 0, 0),
+    ([{"LOWER": {"REGEX": "^X", "NOT_IN": ["XXX", "XY"]}}], 0, 0),
+    ([{"NORM": "a"}, {"POS": {"IN": ["NOUN"]}}], 0, 0),
+    ([{"_": {"foo": {"NOT_IN": ["bar", "baz"]}, "a": 5, "b": {">": 10}}}], 0, 0),
+]
+
+XFAIL_TEST_PATTERNS = [([{"orth": "foo"}], 0, 0)]
+
+
 @pytest.fixture
 def validator():

@@ -22,27 +52,24 @@ def test_matcher_pattern_validation(en_vocab, pattern):
         matcher.add("TEST", None, pattern)
 
 
-@pytest.mark.parametrize(
-    "pattern,n_errors",
-    [
-        # Bad patterns
-        ([{"XX": "foo"}], 1),
-        ([{"LENGTH": "2", "TEXT": 2}, {"LOWER": "test"}], 2),
-        ([{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], 2),
-        ([{"IS_ALPHA": {"==": True}}, {"LIKE_NUM": None}], 2),
-        ([{"TEXT": {"VALUE": "foo"}}], 1),
-        ([{"LENGTH": {"VALUE": 5}}], 1),
-        ([{"_": "foo"}], 1),
-        ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 1),
-        ([{"IS_PUNCT": True, "OP": "$"}], 1),
-        # Good patterns
-        ([{"TEXT": "foo"}, {"LOWER": "bar"}], 0),
-        ([{"LEMMA": {"IN": ["love", "like"]}}, {"POS": "DET", "OP": "?"}], 0),
-        ([{"LIKE_NUM": True, "LENGTH": {">=": 5}}], 0),
-        ([{"LOWER": {"REGEX": "^X", "NOT_IN": ["XXX", "XY"]}}], 0),
-        ([{"_": {"foo": {"NOT_IN": ["bar", "baz"]}, "a": 5, "b": {">": 10}}}], 0),
-    ],
-)
-def test_pattern_validation(validator, pattern, n_errors):
+@pytest.mark.parametrize("pattern,n_errors,_", TEST_PATTERNS)
+def test_pattern_validation(validator, pattern, n_errors, _):
     errors = validate_json(pattern, validator)
     assert len(errors) == n_errors
+
+
+@pytest.mark.xfail
+@pytest.mark.parametrize("pattern,n_errors,_", XFAIL_TEST_PATTERNS)
+def test_xfail_pattern_validation(validator, pattern, n_errors, _):
+    errors = validate_json(pattern, validator)
+    assert len(errors) == n_errors
+
+
+@pytest.mark.parametrize("pattern,n_errors,n_min_errors", TEST_PATTERNS)
+def test_minimal_pattern_validation(en_vocab, pattern, n_errors, n_min_errors):
+    matcher = Matcher(en_vocab)
+    if n_min_errors > 0:
+        with pytest.raises(ValueError):
+            matcher.add("TEST", None, pattern)
+    elif n_errors == 0:
+        matcher.add("TEST", None, pattern)

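The refactor shares one TEST_PATTERNS table between the schema-validation test and the new minimal-check test, with each entry carrying both expected error counts. A sketch of what the validator fixture presumably wraps, given the imports at the top of the file:

```python
from spacy.matcher._schemas import TOKEN_PATTERN_SCHEMA
from spacy.util import get_json_validator, validate_json

validator = get_json_validator(TOKEN_PATTERN_SCHEMA)
# Second field of a TEST_PATTERNS entry = expected schema error count:
errors = validate_json([{"LENGTH": "2", "TEXT": 2}], validator)
print(len(errors))  # 2, matching the table above
```
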
@@ -99,3 +99,36 @@ def test_phrase_matcher_validation(en_vocab):
     with pytest.warns(None) as record:
         matcher.add("TEST4", None, doc2)
         assert not record.list
+
+
+def test_attr_validation(en_vocab):
+    with pytest.raises(ValueError):
+        PhraseMatcher(en_vocab, attr="UNSUPPORTED")
+
+
+def test_attr_pipeline_checks(en_vocab):
+    doc1 = Doc(en_vocab, words=["Test"])
+    doc1.is_parsed = True
+    doc2 = Doc(en_vocab, words=["Test"])
+    doc2.is_tagged = True
+    doc3 = Doc(en_vocab, words=["Test"])
+    # DEP requires is_parsed
+    matcher = PhraseMatcher(en_vocab, attr="DEP")
+    matcher.add("TEST1", None, doc1)
+    with pytest.raises(ValueError):
+        matcher.add("TEST2", None, doc2)
+    with pytest.raises(ValueError):
+        matcher.add("TEST3", None, doc3)
+    # TAG, POS, LEMMA require is_tagged
+    for attr in ("TAG", "POS", "LEMMA"):
+        matcher = PhraseMatcher(en_vocab, attr=attr)
+        matcher.add("TEST2", None, doc2)
+        with pytest.raises(ValueError):
+            matcher.add("TEST1", None, doc1)
+        with pytest.raises(ValueError):
+            matcher.add("TEST3", None, doc3)
+    # TEXT/ORTH only require tokens
+    matcher = PhraseMatcher(en_vocab, attr="ORTH")
+    matcher.add("TEST3", None, doc3)
+    matcher = PhraseMatcher(en_vocab, attr="TEXT")
+    matcher.add("TEST3", None, doc3)

@@ -137,7 +137,8 @@ def test_entity_ruler_validate(nlp):
     valid_pattern = {"label": "HELLO", "pattern": [{"LOWER": "HELLO"}]}
     invalid_pattern = {"label": "HELLO", "pattern": [{"ASDF": "HELLO"}]}
 
-    # invalid pattern is added without errors without validate
+    # invalid pattern raises error without validate
+    with pytest.raises(ValueError):
         ruler.add_patterns([invalid_pattern])
 
     # valid pattern is added without errors with validate

@@ -859,12 +859,12 @@ token pattern covering the exact tokenization of the term.
 <Infobox title="Important note on creating patterns" variant="warning">
 
 To create the patterns, each phrase has to be processed with the `nlp` object.
-If you have a mode loaded, doing this in a loop or list comprehension can easily
-become inefficient and slow. If you **only need the tokenization and lexical
-attributes**, you can run [`nlp.make_doc`](/api/language#make_doc) instead,
-which will only run the tokenizer. For an additional speed boost, you can also
-use the [`nlp.tokenizer.pipe`](/api/tokenizer#pipe) method, which will process
-the texts as a stream.
+If you have a model loaded, doing this in a loop or list comprehension can
+easily become inefficient and slow. If you **only need the tokenization and
+lexical attributes**, you can run [`nlp.make_doc`](/api/language#make_doc)
+instead, which will only run the tokenizer. For an additional speed boost, you
+can also use the [`nlp.tokenizer.pipe`](/api/tokenizer#pipe) method, which will
+process the texts as a stream.
 
 ```diff
 - patterns = [nlp(term) for term in LOTS_OF_TERMS]

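The reflowed docs paragraph above describes the pattern-creation speed-up; a compact sketch of both suggestions (the terms are illustrative):

```python
from spacy.lang.en import English

nlp = English()
terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."]
# Only tokenization is needed for phrase patterns, so skip the pipeline:
patterns = [nlp.make_doc(term) for term in terms]
# Or process the terms as a stream for an extra speed boost:
patterns = list(nlp.tokenizer.pipe(terms))
```
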
@@ -127,7 +127,7 @@
         { "code": "is", "name": "Icelandic" },
         { "code": "lt", "name": "Lithuanian" },
         { "code": "lv", "name": "Latvian" },
-        { "code": "rs", "name": "Serbian" },
+        { "code": "sr", "name": "Serbian" },
         { "code": "sk", "name": "Slovak" },
         { "code": "sl", "name": "Slovenian" },
         {

@@ -1089,6 +1089,62 @@
             "youtube": "6zm9NC9uRkk",
             "category": ["videos"]
         },
+        {
+            "type": "education",
+            "id": "video-intro-to-nlp-episode-1",
+            "title": "Intro to NLP with spaCy",
+            "slogan": "Episode 1: Data exploration",
+            "description": "In this new video series, data science instructor Vincent Warmerdam gets started with spaCy, an open-source library for Natural Language Processing in Python. His mission: building a system to automatically detect programming languages in large volumes of text. Follow his process from the first idea to a prototype all the way to data collection and training a statistical named entity recogntion model from scratch.",
+            "author": "Vincent Warmerdam",
+            "author_links": {
+                "twitter": "fishnets88",
+                "github": "koaning"
+            },
+            "youtube": "WnGPv6HnBok",
+            "category": ["videos"]
+        },
+        {
+            "type": "education",
+            "id": "video-spacy-irl-entity-linking",
+            "title": "Entity Linking functionality in spaCy",
+            "slogan": "spaCy IRL 2019",
+            "url": "https://www.youtube.com/playlist?list=PLBmcuObd5An4UC6jvK_-eSl6jCvP1gwXc",
+            "author": "Sofie Van Landeghem",
+            "author_links": {
+                "twitter": "OxyKodit",
+                "github": "svlandeg"
+            },
+            "youtube": "PW3RJM8tDGo",
+            "category": ["videos"]
+        },
+        {
+            "type": "education",
+            "id": "video-spacy-irl-lemmatization",
+            "title": "Rethinking rule-based lemmatization",
+            "slogan": "spaCy IRL 2019",
+            "url": "https://www.youtube.com/playlist?list=PLBmcuObd5An4UC6jvK_-eSl6jCvP1gwXc",
+            "author": "Guadalupe Romero",
+            "author_links": {
+                "twitter": "_guadiromero",
+                "github": "guadi1994"
+            },
+            "youtube": "88zcQODyuko",
+            "category": ["videos"]
+        },
+        {
+            "type": "education",
+            "id": "video-spacy-irl-scispacy",
+            "title": "ScispaCy: A spaCy pipeline & models for scientific & biomedical text",
+            "slogan": "spaCy IRL 2019",
+            "url": "https://www.youtube.com/playlist?list=PLBmcuObd5An4UC6jvK_-eSl6jCvP1gwXc",
+            "author": "Mark Neumann",
+            "author_links": {
+                "twitter": "MarkNeumannnn",
+                "github": "DeNeutoy"
+            },
+            "youtube": "2_HSKDALwuw",
+            "category": ["videos"]
+        },
         {
             "type": "education",
             "id": "podcast-nlp-highlights",

@@ -86,6 +86,7 @@ const UniverseContent = ({ content = [], categories, pageContext, location, mdxC
                                         <img
                                             src={`https://img.youtube.com/vi/${youtube}/0.jpg`}
                                             alt=""
+                                            style={{ clipPath: 'inset(12.5% 0)' }}
                                         />
                                     )
                                     return cover ? (