diff --git a/website/usage/_data.json b/website/usage/_data.json
index b34304ed6..f77f7929c 100644
--- a/website/usage/_data.json
+++ b/website/usage/_data.json
@@ -103,11 +103,10 @@
         "title": "Language Processing Pipelines",
         "next": "vectors-similarity",
         "menu": {
-            "How pipelines work": "pipelines",
-            "Examples": "examples",
+            "How Pipelines Work": "pipelines",
+            "Custom Components": "custom-components",
             "Multi-threading": "multithreading",
-            "User Hooks": "user-hooks",
-            "Serialization": "serialization"
+            "Serialization": "serialization"
        }
    },

diff --git a/website/usage/_processing-pipelines/_custom-components.jade b/website/usage/_processing-pipelines/_custom-components.jade
new file mode 100644
index 000000000..13f0cb85c
--- /dev/null
+++ b/website/usage/_processing-pipelines/_custom-components.jade
@@ -0,0 +1,151 @@
+//- 💫 DOCS > USAGE > PROCESSING PIPELINES > CUSTOM COMPONENTS
+
+p
+    | A component receives a #[code Doc] object and
+    | #[strong performs the actual processing] – for example, using the current
+    | weights to make a prediction and set some annotation on the document. By
+    | adding a component to the pipeline, you'll get access to the #[code Doc]
+    | at any point #[strong during] processing – instead of only being able to
+    | modify it afterwards.
+
++aside-code("Example").
+    def my_component(doc):
+        # do something to the doc here
+        return doc
+
++table(["Argument", "Type", "Description"])
+    +row
+        +cell #[code doc]
+        +cell #[code Doc]
+        +cell The #[code Doc] object processed by the previous component.
+
+    +row("foot")
+        +cell returns
+        +cell #[code Doc]
+        +cell The #[code Doc] object processed by this pipeline component.
+
+p
+    | Custom components can be added to the pipeline using the
+    | #[+api("language#add_pipe") #[code add_pipe]] method. Optionally, you
+    | can specify an existing component to add it before or after, tell spaCy
+    | to add it first or last in the pipeline, or define a custom name.
+    | If no name is set and no #[code name] attribute is present on your
+    | component, the function name, i.e. #[code component.__name__], is used.
+
++code("Adding pipeline components").
+    import spacy
+
+    def my_component(doc):
+        print("After tokenization, this doc has %s tokens." % len(doc))
+        if len(doc) < 10:
+            print("This is a pretty short document.")
+        return doc
+
+    nlp = spacy.load('en')
+    nlp.add_pipe(my_component, name='print_info', first=True)
+    print(nlp.pipe_names)  # ['print_info', 'tagger', 'parser', 'ner']
+    doc = nlp(u"This is a sentence.")
+
+p
+    | Of course, you can also wrap your component as a class to allow
+    | initialising it with custom settings and to hold state within the
+    | component. This is useful for #[strong stateful components], especially
+    | ones which #[strong depend on shared data].
+
++code.
+    class MyComponent(object):
+        name = 'print_info'
+
+        def __init__(self, vocab, short_limit=10):
+            self.vocab = vocab
+            self.short_limit = short_limit
+
+        def __call__(self, doc):
+            if len(doc) < self.short_limit:
+                print("This is a pretty short document.")
+            return doc
+
+    my_component = MyComponent(nlp.vocab, short_limit=25)
+    nlp.add_pipe(my_component, first=True)
+
++h(3, "custom-components-attributes")
+    | Setting attributes on the #[code Doc], #[code Span] and #[code Token]
+
++aside("Why ._?")
+    | Writing to a #[code ._] attribute instead of to the #[code Doc] directly
+    | keeps a clearer separation and makes it easier to ensure backwards
+    | compatibility.
+    | For example, if you've implemented your own #[code .coref]
+    | property and spaCy claims it one day, it'll break your code. Similarly,
+    | just by looking at the code, you'll immediately know what's built-in and
+    | what's custom – for example, #[code doc.sentiment] is spaCy, while
+    | #[code doc._.sent_score] isn't.
+
++under-construction
+
++h(3, "custom-components-user-hooks") Other user hooks
+
+p
+    | While it's generally recommended to use the #[code Doc._], #[code Span._]
+    | and #[code Token._] proxies to add your own custom attributes, spaCy
+    | offers a few exceptions to allow #[strong customising the built-in methods]
+    | like #[+api("doc#similarity") #[code Doc.similarity]] or
+    | #[+api("doc#vector") #[code Doc.vector]] with your own hooks, which can
+    | rely on statistical models you train yourself. For instance, you can
+    | provide your own on-the-fly sentence segmentation algorithm or document
+    | similarity method.
+
+p
+    | Hooks let you customise some of the behaviours of the #[code Doc],
+    | #[code Span] or #[code Token] objects by adding a component to the
+    | pipeline. For instance, to customise the
+    | #[+api("doc#similarity") #[code Doc.similarity]] method, you can add a
+    | component that sets a custom function to
+    | #[code doc.user_hooks['similarity']]. The built-in #[code Doc.similarity]
+    | method will check the #[code user_hooks] dict, and delegate to your
+    | function if you've set one. Similar results can be achieved by setting
+    | functions to #[code Doc.user_span_hooks] and #[code Doc.user_token_hooks].
+
++aside("Implementation note")
+    | The hooks live on the #[code Doc] object because the #[code Span] and
+    | #[code Token] objects are created lazily, and don't own any data. They
+    | just proxy to their parent #[code Doc]. This turns out to be convenient
+    | here — we only have to worry about installing hooks in one place.
+
++table(["Name", "Customises"])
+    +row
+        +cell #[code user_hooks]
+        +cell
+            +api("doc#vector") #[code Doc.vector]
+            +api("doc#has_vector") #[code Doc.has_vector]
+            +api("doc#vector_norm") #[code Doc.vector_norm]
+            +api("doc#sents") #[code Doc.sents]
+
+    +row
+        +cell #[code user_token_hooks]
+        +cell
+            +api("token#similarity") #[code Token.similarity]
+            +api("token#vector") #[code Token.vector]
+            +api("token#has_vector") #[code Token.has_vector]
+            +api("token#vector_norm") #[code Token.vector_norm]
+            +api("token#conjuncts") #[code Token.conjuncts]
+
+    +row
+        +cell #[code user_span_hooks]
+        +cell
+            +api("span#similarity") #[code Span.similarity]
+            +api("span#vector") #[code Span.vector]
+            +api("span#has_vector") #[code Span.has_vector]
+            +api("span#vector_norm") #[code Span.vector_norm]
+            +api("span#root") #[code Span.root]
+
++code("Add custom similarity hooks").
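+    # A sketch rather than a full implementation: `model` is assumed to be
+    # any callable that takes a pair of vectors and returns a sequence whose
+    # first element is the similarity score. Installing the same method in
+    # all three hook dicts makes Doc.similarity, Span.similarity and
+    # Token.similarity all delegate to it.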
+    class SimilarityModel(object):
+        def __init__(self, model):
+            self._model = model
+
+        def __call__(self, doc):
+            doc.user_hooks['similarity'] = self.similarity
+            doc.user_span_hooks['similarity'] = self.similarity
+            doc.user_token_hooks['similarity'] = self.similarity
+            return doc
+
+        def similarity(self, obj1, obj2):
+            y = self._model([obj1.vector, obj2.vector])
+            return float(y[0])

diff --git a/website/usage/_processing-pipelines/_user-hooks.jade b/website/usage/_processing-pipelines/_user-hooks.jade
deleted file mode 100644
index e7dce53fe..000000000
--- a/website/usage/_processing-pipelines/_user-hooks.jade
+++ /dev/null
@@ -1,61 +0,0 @@
-//- 💫 DOCS > USAGE > PROCESSING PIPELINES > ATTRIBUTE HOOKS
-
-p
-    | Hooks let you customize some of the behaviours of the #[code Doc],
-    | #[code Span] or #[code Token] objects by adding a component to the
-    | pipeline. For instance, to customize the
-    | #[+api("doc#similarity") #[code Doc.similarity]] method, you can add a
-    | component that sets a custom function to
-    | #[code doc.user_hooks['similarity']]. The built-in #[code Doc.similarity]
-    | method will check the #[code user_hooks] dict, and delegate to your
-    | function if you've set one. Similar results can be achieved by setting
-    | functions to #[code Doc.user_span_hooks] and #[code Doc.user_token_hooks].
-
-+code("Polymorphic similarity example").
-    span.similarity(doc)
-    token.similarity(span)
-    doc1.similarity(doc2)
-
-p
-    | By default, this just averages the vectors for each document, and
-    | computes their cosine. Obviously, spaCy should make it easy for you to
-    | install your own similarity model. This introduces a tricky design
-    | challenge. The current solution is to add three more dicts to the
-    | #[code Doc] object:
-
-+aside("Implementation note")
-    | The hooks live on the #[code Doc] object because the #[code Span] and
-    | #[code Token] objects are created lazily, and don't own any data. They
-    | just proxy to their parent #[code Doc]. This turns out to be convenient
-    | here — we only have to worry about installing hooks in one place.
-
-+table(["Name", "Description"])
-    +row
-        +cell #[code user_hooks]
-        +cell Customise behaviour of #[code doc.vector], #[code doc.has_vector], #[code doc.vector_norm] or #[code doc.sents]
-
-    +row
-        +cell #[code user_token_hooks]
-        +cell Customise behaviour of #[code token.similarity], #[code token.vector], #[code token.has_vector], #[code token.vector_norm] or #[code token.conjuncts]
-
-    +row
-        +cell #[code user_span_hooks]
-        +cell Customise behaviour of #[code span.similarity], #[code span.vector], #[code span.has_vector], #[code span.vector_norm] or #[code span.root]
-
-p
-    | To sum up, here's an example of hooking in custom #[code .similarity()]
-    | methods:
-
-+code("Add custom similarity hooks").
-    class SimilarityModel(object):
-        def __init__(self, model):
-            self._model = model
-
-        def __call__(self, doc):
-            doc.user_hooks['similarity'] = self.similarity
-            doc.user_span_hooks['similarity'] = self.similarity
-            doc.user_token_hooks['similarity'] = self.similarity
-
-        def similarity(self, obj1, obj2):
-            y = self._model([obj1.vector, obj2.vector])
-            return float(y[0])

diff --git a/website/usage/processing-pipelines.jade b/website/usage/processing-pipelines.jade
index 0bb96780e..0d0579883 100644
--- a/website/usage/processing-pipelines.jade
+++ b/website/usage/processing-pipelines.jade
@@ -8,18 +8,14 @@ include _spacy-101/_pipelines
     +h(2, "pipelines") How pipelines work
     include _processing-pipelines/_pipelines
 
-+section("examples")
-    +h(2, "examples") Examples
-    include _processing-pipelines/_examples
++section("custom-components")
+    +h(2, "custom-components") Creating custom pipeline components
+    include _processing-pipelines/_custom-components
 
 +section("multithreading")
     +h(2, "multithreading") Multi-threading
     include _processing-pipelines/_multithreading
 
-+section("user-hooks")
-    +h(2, "user-hooks") User hooks
-    include _processing-pipelines/_user-hooks
-
 +section("serialization")
     +h(2, "serialization") Serialization
     include _processing-pipelines/_serialization
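A minimal end-to-end sketch of the placement options documented above (`before`, `after`, `first` and `last` on `add_pipe`), assuming spaCy v2 and an installed 'en' model; the component names and the `is_short` key are illustrative only, not part of the documented API:

    import spacy

    def print_length(doc):
        # minimal component: report the doc's length, then pass the doc on
        print("Doc has %d tokens." % len(doc))
        return doc

    def mark_short(doc):
        # illustrative component: store a flag in the doc's user_data dict
        doc.user_data['is_short'] = len(doc) < 10
        return doc

    nlp = spacy.load('en')
    nlp.add_pipe(print_length, name='print_length', first=True)  # run first
    nlp.add_pipe(mark_short, after='ner')  # run after a named component
    print(nlp.pipe_names)
    # ['print_length', 'tagger', 'parser', 'ner', 'mark_short']
    doc = nlp(u"This is a sentence.")
    print(doc.user_data['is_short'])  # True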