diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index b03aba568..ab995ceee 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -124,6 +124,12 @@ mixin help(tooltip, icon_size) +icon("help_o", icon_size || 16).o-icon--inline +//- Abbreviation + +mixin abbr(title) + abbr.o-abbr(data-tooltip=title data-tooltip-style="code" aria-label=title)&attributes(attributes) + block + //- Aside wrapper label - [string] aside label diff --git a/website/_includes/_sidebar.jade b/website/_includes/_sidebar.jade index 926fb46fa..6bce6630c 100644 --- a/website/_includes/_sidebar.jade +++ b/website/_includes/_sidebar.jade @@ -9,7 +9,7 @@ menu.c-sidebar.js-sidebar.u-text each url, item in items - var is_current = CURRENT == url || (CURRENT == "index" && url == "./") li.c-sidebar__item - +a(url)(class=is_current ? "is-active" : null tabindex=is_current ? "-1" : null)=item + +a(url)(class=is_current ? "is-active" : null tabindex=is_current ? "-1" : null data-sidebar-active=is_current ? "" : null)=item if is_current if IS_MODELS && CURRENT_MODELS.length diff --git a/website/api/_architecture/_cython.jade b/website/api/_architecture/_cython.jade deleted file mode 100644 index 84b98b824..000000000 --- a/website/api/_architecture/_cython.jade +++ /dev/null @@ -1,115 +0,0 @@ -//- πŸ’« DOCS > API > ARCHITECTURE > CYTHON - -+aside("What's Cython?") - | #[+a("http://cython.org/") Cython] is a language for writing - | C extensions for Python. Most Python code is also valid Cython, but - | you can add type declarations to get efficient memory-managed code - | just like C or C++. - -p - | spaCy's core data structures are implemented as - | #[+a("http://cython.org/") Cython] #[code cdef] classes. Memory is - | managed through the #[+a(gh("cymem")) #[code cymem]] - | #[code cymem.Pool] class, which allows you - | to allocate memory which will be freed when the #[code Pool] object - | is garbage collected. This means you usually don't have to worry - | about freeing memory. You just have to decide which Python object - | owns the memory, and make it own the #[code Pool]. When that object - | goes out of scope, the memory will be freed. You do have to take - | care that no pointers outlive the object that owns them β€” but this - | is generally quite easy. - -p - | All Cython modules should have the #[code # cython: infer_types=True] - | compiler directive at the top of the file. This makes the code much - | cleaner, as it avoids the need for many type declarations. If - | possible, you should prefer to declare your functions #[code nogil], - | even if you don't especially care about multi-threading. The reason - | is that #[code nogil] functions help the Cython compiler reason about - | your code quite a lot β€” you're telling the compiler that no Python - | dynamics are possible. This lets many errors be raised, and ensures - | your function will run at C speed. - - -p - | Cython gives you many choices of sequences: you could have a Python - | list, a numpy array, a memory view, a C++ vector, or a pointer. - | Pointers are preferred, because they are fastest, have the most - | explicit semantics, and let the compiler check your code more - | strictly. C++ vectors are also great β€” but you should only use them - | internally in functions. It's less friendly to accept a vector as an - | argument, because that asks the user to do much more work. Here's - | how to get a pointer from a numpy array, memory view or vector: - -+code. 
- cdef void get_pointers(np.ndarray[int, mode='c'] numpy_array, vector[int] cpp_vector, int[::1] memory_view) nogil: - pointer1 = <int*>numpy_array.data - pointer2 = cpp_vector.data() - pointer3 = &memory_view[0] - -p - | Both C arrays and C++ vectors reassure the compiler that no Python - | operations are possible on your variable. This is a big advantage: - | it lets the Cython compiler raise many more errors for you. - -p - | When getting a pointer from a numpy array or memoryview, take care - | that the data is actually stored in C-contiguous order β€” otherwise - | you'll get a pointer to nonsense. The type-declarations in the code - | above should generate runtime errors if buffers with incorrect - | memory layouts are passed in. To iterate over the array, the - | following style is preferred: - -+code. - cdef int c_total(const int* int_array, int length) nogil: - total = 0 - for item in int_array[:length]: - total += item - return total - -p - | If this is confusing, consider that the compiler couldn't deal with - | #[code for item in int_array:] β€” there's no length attached to a raw - | pointer, so how could we figure out where to stop? The length is - | provided in the slice notation as a solution to this. Note that we - | don't have to declare the type of #[code item] in the code above β€” - | the compiler can easily infer it. This gives us tidy code that looks - | quite like Python, but is exactly as fast as C β€” because we've made - | sure the compilation to C is trivial. - -p - | Your functions cannot be declared #[code nogil] if they need to - | create Python objects or call Python functions. This is perfectly - | okay β€” you shouldn't torture your code just to get #[code nogil] - | functions. However, if your function isn't #[code nogil], you should - | compile your module with #[code cython -a --cplus my_module.pyx] and - | open the resulting #[code my_module.html] file in a browser. This - | will let you see how Cython is compiling your code. Calls into the - | Python run-time will be in bright yellow. This lets you easily see - | whether Cython is able to correctly type your code, or whether there - | are unexpected problems. - -p - | Working in Cython is very rewarding once you're over the initial - | learning curve. As with C and C++, the first way you write something - | in Cython will often be the performance-optimal approach. In - | contrast, Python optimisation generally requires a lot of - | experimentation. Is it faster to have an #[code if item in my_dict] - | check, or to use #[code .get()]? What about - | #[code try]/#[code except]? Does this numpy operation create a copy? - | There's no way to guess the answers to these questions, and you'll - | usually be dissatisfied with your results β€” so there's no way to - | know when to stop this process. In the worst case, you'll make a - | mess that invites the next reader to try their luck too. This is - | like one of those - | #[+a("http://www.wemjournal.org/article/S1080-6032%2809%2970088-2/abstract") volcanic gas-traps], - | where the rescuers keep passing out from low oxygen, causing - | another rescuer to follow β€” only to succumb themselves. In short, - | just say no to optimizing your Python. If it's not fast enough the - | first time, just switch to Cython. 
- -+infobox("Resources") - +list.o-no-block - +item #[+a("http://docs.cython.org/en/latest/") Official Cython documentation] (cython.org) - +item #[+a("https://explosion.ai/blog/writing-c-in-cython", true) Writing C in Cython] (explosion.ai) - +item #[+a("https://explosion.ai/blog/multithreading-with-cython") Multi-threading spaCy’s parser and named entity recogniser] (explosion.ai) diff --git a/website/api/_architecture/_nn-model.jade b/website/api/_architecture/_nn-model.jade deleted file mode 100644 index b7c32eae6..000000000 --- a/website/api/_architecture/_nn-model.jade +++ /dev/null @@ -1,149 +0,0 @@ -//- πŸ’« DOCS > API > ARCHITECTURE > NN MODEL ARCHITECTURE - -p - | spaCy's statistical models have been custom-designed to give a - | high-performance mix of speed and accuracy. The current architecture - | hasn't been published yet, but in the meantime we prepared a video that - | explains how the models work, with particular focus on NER. - -+youtube("sqDHBH9IjRU") - -p - | The parsing model is a blend of recent results. The two recent - | inspirations have been the work of Eli Klipperwasser and Yoav Goldberg at - | Bar Ilan#[+fn(1)], and the SyntaxNet team from Google. The foundation of - | the parser is still based on the work of Joakim Nivre#[+fn(2)], who - | introduced the transition-based framework#[+fn(3)], the arc-eager - | transition system, and the imitation learning objective. The model is - | implemented using #[+a(gh("thinc")) Thinc], spaCy's machine learning - | library. We first predict context-sensitive vectors for each word in the - | input: - -+code. - (embed_lower | embed_prefix | embed_suffix | embed_shape) - >> Maxout(token_width) - >> convolution ** 4 - -p - | This convolutional layer is shared between the tagger, parser and NER, - | and will also be shared by the future neural lemmatizer. Because the - | parser shares these layers with the tagger, the parser does not require - | tag features. I got this trick from David Weiss's "Stack Combination" - | paper#[+fn(4)]. - -p - | To boost the representation, the tagger actually predicts a "super tag" - | with POS, morphology and dependency label#[+fn(5)]. The tagger predicts - | these supertags by adding a softmax layer onto the convolutional layer – - | so, we're teaching the convolutional layer to give us a representation - | that's one affine transform from this informative lexical information. - | This is obviously good for the parser (which backprops to the - | convolutions too). The parser model makes a state vector by concatenating - | the vector representations for its context tokens. The current context - | tokens: - -+table - +row - +cell #[code S0], #[code S1], #[code S2] - +cell Top three words on the stack. - - +row - +cell #[code B0], #[code B1] - +cell First two words of the buffer. - - +row - +cell - | #[code S0L1], #[code S1L1], #[code S2L1], #[code B0L1], - | #[code B1L1]#[br] - | #[code S0L2], #[code S1L2], #[code S2L2], #[code B0L2], - | #[code B1L2] - +cell - | Leftmost and second leftmost children of #[code S0], #[code S1], - | #[code S2], #[code B0] and #[code B1]. - - +row - +cell - | #[code S0R1], #[code S1R1], #[code S2R1], #[code B0R1], - | #[code B1R1]#[br] - | #[code S0R2], #[code S1R2], #[code S2R2], #[code B0R2], - | #[code B1R2] - +cell - | Rightmost and second rightmost children of #[code S0], #[code S1], - | #[code S2], #[code B0] and #[code B1]. - -p - | This makes the state vector quite long: #[code 13*T], where #[code T] is - | the token vector width (128 is working well). 
Fortunately, there's a way - | to structure the computation to save some expense (and make it more - | GPU-friendly). - -p - | The parser typically visits #[code 2*N] states for a sentence of length - | #[code N] (although it may visit more, if it back-tracks with a - | non-monotonic transition#[+fn(4)]). A naive implementation would require - | #[code 2*N (B, 13*T) @ (13*T, H)] matrix multiplications for a batch of - | size #[code B]. We can instead perform one #[code (B*N, T) @ (T, 13*H)] - | multiplication, to pre-compute the hidden weights for each positional - | feature with respect to the words in the batch. (Note that our token - | vectors come from the CNN β€” so we can't play this trick over the - | vocabulary. That's how Stanford's NN parser#[+fn(3)] works β€” and why its - | model is so big.) - -p - | This pre-computation strategy allows a nice compromise between - | GPU-friendliness and implementation simplicity. The CNN and the wide - | lower layer are computed on the GPU, and then the precomputed hidden - | weights are moved to the CPU, before we start the transition-based - | parsing process. This makes a lot of things much easier. We don't have to - | worry about variable-length batch sizes, and we don't have to implement - | the dynamic oracle in CUDA to train. - -p - | Currently the parser's loss function is multilabel log loss#[+fn(6)], as - | the dynamic oracle allows multiple states to be 0 cost. This is defined - | as follows, where #[code gZ] is the sum of the scores assigned to gold - | classes: - -+code. - (exp(score) / Z) - (exp(score) / gZ) - -+bibliography - +item - | #[+a("https://www.semanticscholar.org/paper/Simple-and-Accurate-Dependency-Parsing-Using-Bidir-Kiperwasser-Goldberg/3cf31ecb2724b5088783d7c96a5fc0d5604cbf41") Simple and Accurate Dependency Parsing Using Bidirectional LSTM Feature Representations] - br - | Eliyahu Kiperwasser, Yoav Goldberg. (2016) - - +item - | #[+a("https://www.semanticscholar.org/paper/A-Dynamic-Oracle-for-Arc-Eager-Dependency-Parsing-Goldberg-Nivre/22697256ec19ecc3e14fcfc63624a44cf9c22df4") A Dynamic Oracle for Arc-Eager Dependency Parsing] - br - | Yoav Goldberg, Joakim Nivre (2012) - - +item - | #[+a("https://explosion.ai/blog/parsing-english-in-python") Parsing English in 500 Lines of Python] - br - | Matthew Honnibal (2013) - - +item - | #[+a("https://www.semanticscholar.org/paper/Stack-propagation-Improved-Representation-Learning-Zhang-Weiss/0c133f79b23e8c680891d2e49a66f0e3d37f1466") Stack-propagation: Improved Representation Learning for Syntax] - br - | Yuan Zhang, David Weiss (2016) - - +item - | #[+a("https://www.semanticscholar.org/paper/Deep-multi-task-learning-with-low-level-tasks-supe-S%C3%B8gaard-Goldberg/03ad06583c9721855ccd82c3d969a01360218d86") Deep multi-task learning with low level tasks supervised at lower layers] - br - | Anders SΓΈgaard, Yoav Goldberg (2016) - - +item - | #[+a("https://www.semanticscholar.org/paper/An-Improved-Non-monotonic-Transition-System-for-De-Honnibal-Johnson/4094cee47ade13b77b5ab4d2e6cb9dd2b8a2917c") An Improved Non-monotonic Transition System for Dependency Parsing] - br - | Matthew Honnibal, Mark Johnson (2015) - - +item - | #[+a("http://cs.stanford.edu/people/danqi/papers/emnlp2014.pdf") A Fast and Accurate Dependency Parser using Neural Networks] - br - | Danqi Cheng, Christopher D. 
Manning (2014)
-
-    +item
-        | #[+a("https://www.semanticscholar.org/paper/Parsing-the-Wall-Street-Journal-using-a-Lexical-Fu-Riezler-King/0ad07862a91cd59b7eb5de38267e47725a62b8b2") Parsing the Wall Street Journal using a Lexical-Functional Grammar and Discriminative Estimation Techniques]
-        br
-        | Stefan Riezler et al. (2002)
diff --git a/website/api/_cython/_doc.jade b/website/api/_cython/_doc.jade
new file mode 100644
index 000000000..e54f09b70
--- /dev/null
+++ b/website/api/_cython/_doc.jade
@@ -0,0 +1,71 @@
+//- πŸ’« DOCS > API > CYTHON > CLASSES > DOC
+
+p
+    | The #[code Doc] object holds an array of
+    | #[+api("cython-structs#tokenc") #[code TokenC]] structs.
+
++infobox
+    | This section documents the extra C-level attributes and methods that
+    | can't be accessed from Python. For the Python documentation, see
+    | #[+api("doc") #[code Doc]].
+
++h(3, "doc_attributes") Attributes
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code mem]
+        +cell #[code cymem.Pool]
+        +cell
+            | A memory pool. Allocated memory will be freed once the
+            | #[code Doc] object is garbage collected.
+
+    +row
+        +cell #[code vocab]
+        +cell #[code Vocab]
+        +cell A reference to the shared #[code Vocab] object.
+
+    +row
+        +cell #[code c]
+        +cell #[code TokenC*]
+        +cell
+            | A pointer to a #[+api("cython-structs#tokenc") #[code TokenC]]
+            | struct.
+
+    +row
+        +cell #[code length]
+        +cell #[code int]
+        +cell The number of tokens in the document.
+
+    +row
+        +cell #[code max_length]
+        +cell #[code int]
+        +cell The underlying size of the #[code Doc.c] array.
+
++h(3, "doc_push_back") Doc.push_back
+    +tag method
+
+p
+    | Append a token to the #[code Doc]. The token can be provided as a
+    | #[+api("cython-structs#lexemec") #[code LexemeC]] or
+    | #[+api("cython-structs#tokenc") #[code TokenC]] pointer, using Cython's
+    | #[+a("http://cython.readthedocs.io/en/latest/src/userguide/fusedtypes.html") fused types].
+
++aside-code("Example").
+    from spacy.tokens cimport Doc
+    from spacy.vocab cimport Vocab
+
+    doc = Doc(Vocab())
+    lexeme = doc.vocab.get(doc.mem, u'hello')
+    doc.push_back(lexeme, True)
+    assert doc.text == u'hello '
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code lex_or_tok]
+        +cell #[code LexemeOrToken]
+        +cell The word to append to the #[code Doc].
+
+    +row
+        +cell #[code has_space]
+        +cell #[code bint]
+        +cell Whether the word has trailing whitespace.
diff --git a/website/api/_cython/_lexeme.jade b/website/api/_cython/_lexeme.jade
new file mode 100644
index 000000000..219c7ec45
--- /dev/null
+++ b/website/api/_cython/_lexeme.jade
@@ -0,0 +1,44 @@
+//- πŸ’« DOCS > API > CYTHON > CLASSES > LEXEME
+
+p
+    | A Cython class providing access and methods for an entry in the
+    | vocabulary.
+
++infobox
+    | This section documents the extra C-level attributes and methods that
+    | can't be accessed from Python. For the Python documentation, see
+    | #[+api("lexeme") #[code Lexeme]].
+
++h(3, "lexeme_attributes") Attributes
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code c]
+        +cell #[code LexemeC*]
+        +cell
+            | A pointer to a #[+api("cython-structs#lexemec") #[code LexemeC]]
+            | struct.
+
+    +row
+        +cell #[code vocab]
+        +cell #[code Vocab]
+        +cell A reference to the shared #[code Vocab] object.
+
+    +row
+        +cell #[code orth]
+        +cell #[+abbr("uint64_t") #[code attr_t]]
+        +cell ID of the verbatim text content.
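+
+p
+    | A minimal usage sketch, assuming spaCy v2's #[code Lexeme(vocab, orth)]
+    | constructor and #[code StringStore.add]; the word #[code u'hello'] is
+    | illustrative only:
+
++aside-code("Example").
+    from spacy.lexeme cimport Lexeme
+    from spacy.vocab cimport Vocab
+
+    vocab = Vocab()
+    lexeme = Lexeme(vocab, vocab.strings.add(u'hello'))
+    # lexeme.c points at the LexemeC struct owned by the vocab
+    assert lexeme.c.orth == lexeme.orth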
diff --git a/website/api/_cython/_lexemec.jade b/website/api/_cython/_lexemec.jade
new file mode 100644
index 000000000..be574a235
--- /dev/null
+++ b/website/api/_cython/_lexemec.jade
@@ -0,0 +1,200 @@
+//- πŸ’« DOCS > API > CYTHON > STRUCTS > LEXEMEC
+
+p
+    | Struct holding information about a lexical type. #[code LexemeC]
+    | structs are usually owned by the #[code Vocab], and accessed through a
+    | read-only pointer on the #[code TokenC] struct.
+
++aside-code("Example").
+    lex = doc.c[3].lex
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code flags]
+        +cell #[+abbr("uint64_t") #[code flags_t]]
+        +cell Bit-field for binary lexical flag values.
+
+    +row
+        +cell #[code id]
+        +cell #[+abbr("uint64_t") #[code attr_t]]
+        +cell
+            | Usually used to map lexemes to rows in a matrix, e.g. for word
+            | vectors. Does not need to be unique, so currently misnamed.
+
+    +row
+        +cell #[code length]
+        +cell #[+abbr("uint64_t") #[code attr_t]]
+        +cell Number of unicode characters in the lexeme.
+
+    +row
+        +cell #[code orth]
+        +cell #[+abbr("uint64_t") #[code attr_t]]
+        +cell ID of the verbatim text content.
+
+    +row
+        +cell #[code lower]
+        +cell #[+abbr("uint64_t") #[code attr_t]]
+        +cell ID of the lowercase form of the lexeme.
+
+    +row
+        +cell #[code norm]
+        +cell #[+abbr("uint64_t") #[code attr_t]]
+        +cell ID of the lexeme's norm, i.e. a normalised form of the text.
+
+    +row
+        +cell #[code shape]
+        +cell #[+abbr("uint64_t") #[code attr_t]]
+        +cell Transform of the lexeme's string, to show orthographic features.
+
+    +row
+        +cell #[code prefix]
+        +cell #[+abbr("uint64_t") #[code attr_t]]
+        +cell
+            | Length-N substring from the start of the lexeme. Defaults to
+            | #[code N=1].
+
+    +row
+        +cell #[code suffix]
+        +cell #[+abbr("uint64_t") #[code attr_t]]
+        +cell
+            | Length-N substring from the end of the lexeme. Defaults to
+            | #[code N=3].
+
+    +row
+        +cell #[code cluster]
+        +cell #[+abbr("uint64_t") #[code attr_t]]
+        +cell Brown cluster ID.
+
+    +row
+        +cell #[code prob]
+        +cell #[code float]
+        +cell Smoothed log probability estimate of the lexeme's type.
+
+    +row
+        +cell #[code sentiment]
+        +cell #[code float]
+        +cell A scalar value indicating positivity or negativity.
+
++h(3, "lexeme_get_struct_attr", "spacy/lexeme.pxd") Lexeme.get_struct_attr
+    +tag staticmethod
+    +tag nogil
+
+p Get the value of an attribute from the #[code LexemeC] struct by attribute ID.
+
++aside-code("Example").
+    from spacy.attrs cimport IS_ALPHA
+    from spacy.lexeme cimport Lexeme
+
+    lexeme = doc.c[3].lex
+    is_alpha = Lexeme.get_struct_attr(lexeme, IS_ALPHA)
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code lex]
+        +cell #[code const LexemeC*]
+        +cell A pointer to a #[code LexemeC] struct.
+
+    +row
+        +cell #[code feat_name]
+        +cell #[code attr_id_t]
+        +cell
+            | The ID of the attribute to look up. The attributes are
+            | enumerated in #[code spacy.attrs].
+
+    +row("foot")
+        +cell returns
+        +cell #[+abbr("uint64_t") #[code attr_t]]
+        +cell The value of the attribute.
+
++h(3, "lexeme_set_struct_attr", "spacy/lexeme.pxd") Lexeme.set_struct_attr
+    +tag staticmethod
+    +tag nogil
+
+p Set the value of an attribute of the #[code LexemeC] struct by attribute ID.
+
++aside-code("Example").
+    from spacy.attrs cimport NORM
+    from spacy.lexeme cimport Lexeme
+
+    lexeme = doc.c[3].lex
+    Lexeme.set_struct_attr(lexeme, NORM, lexeme.lower)
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code lex]
+        +cell #[code const LexemeC*]
+        +cell A pointer to a #[code LexemeC] struct.
+
+    +row
+        +cell #[code feat_name]
+        +cell #[code attr_id_t]
+        +cell
+            | The ID of the attribute to look up. The attributes are
+            | enumerated in #[code spacy.attrs].
+
+    +row
+        +cell #[code value]
+        +cell #[+abbr("uint64_t") #[code attr_t]]
+        +cell The value to set.
+
++h(3, "lexeme_c_check_flag", "spacy/lexeme.pxd") Lexeme.c_check_flag
+    +tag staticmethod
+    +tag nogil
+
+p Check the value of a binary flag attribute.
+
++aside-code("Example").
+    from spacy.attrs cimport IS_STOP
+    from spacy.lexeme cimport Lexeme
+
+    lexeme = doc.c[3].lex
+    is_stop = Lexeme.c_check_flag(lexeme, IS_STOP)
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code lexeme]
+        +cell #[code const LexemeC*]
+        +cell A pointer to a #[code LexemeC] struct.
+
+    +row
+        +cell #[code flag_id]
+        +cell #[code attr_id_t]
+        +cell
+            | The ID of the flag to look up. The flag IDs are enumerated in
+            | #[code spacy.attrs].
+
+    +row("foot")
+        +cell returns
+        +cell #[code bint]
+        +cell The boolean value of the flag.
+
++h(3, "lexeme_c_set_flag", "spacy/lexeme.pxd") Lexeme.c_set_flag
+    +tag staticmethod
+    +tag nogil
+
+p Set the value of a binary flag attribute.
+
++aside-code("Example").
+    from spacy.attrs cimport IS_STOP
+    from spacy.lexeme cimport Lexeme
+
+    lexeme = doc.c[3].lex
+    Lexeme.c_set_flag(lexeme, IS_STOP, 0)
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code lexeme]
+        +cell #[code const LexemeC*]
+        +cell A pointer to a #[code LexemeC] struct.
+
+    +row
+        +cell #[code flag_id]
+        +cell #[code attr_id_t]
+        +cell
+            | The ID of the flag to look up. The flag IDs are enumerated in
+            | #[code spacy.attrs].
+
+    +row
+        +cell #[code value]
+        +cell #[code bint]
+        +cell The value to set.
diff --git a/website/api/_cython/_span.jade b/website/api/_cython/_span.jade
new file mode 100644
index 000000000..0899081a7
--- /dev/null
+++ b/website/api/_cython/_span.jade
@@ -0,0 +1,43 @@
+//- πŸ’« DOCS > API > CYTHON > CLASSES > SPAN
+
+p
+    | A Cython class providing access and methods for a slice of a #[code Doc]
+    | object.
+
++infobox
+    | This section documents the extra C-level attributes and methods that
+    | can't be accessed from Python. For the Python documentation, see
+    | #[+api("span") #[code Span]].
+
++h(3, "span_attributes") Attributes
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code doc]
+        +cell #[code Doc]
+        +cell The parent document.
+
+    +row
+        +cell #[code start]
+        +cell #[code int]
+        +cell The index of the first token of the span.
+
+    +row
+        +cell #[code end]
+        +cell #[code int]
+        +cell The index of the first token after the span.
+
+    +row
+        +cell #[code start_char]
+        +cell #[code int]
+        +cell The index of the first character of the span.
+
+    +row
+        +cell #[code end_char]
+        +cell #[code int]
+        +cell The index of the first character after the span.
+
+    +row
+        +cell #[code label]
+        +cell #[+abbr("uint64_t") #[code attr_t]]
+        +cell A label to attach to the span, e.g. for named entities.
diff --git a/website/api/_cython/_stringstore.jade b/website/api/_cython/_stringstore.jade
new file mode 100644
index 000000000..0fed5fd59
--- /dev/null
+++ b/website/api/_cython/_stringstore.jade
@@ -0,0 +1,36 @@
+//- πŸ’« DOCS > API > CYTHON > CLASSES > STRINGSTORE
+
+p A lookup table to retrieve strings by 64-bit hashes.
+
++infobox
+    | This section documents the extra C-level attributes and methods that
+    | can't be accessed from Python. For the Python documentation, see
+    | #[+api("stringstore") #[code StringStore]].
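+
+p
+    | A minimal sketch of the hash-to-string round trip, assuming spaCy v2's
+    | #[code StringStore.add] method and the #[code hash_string] helper
+    | declared in #[code spacy/strings.pxd]:
+
++aside-code("Example").
+    from spacy.strings cimport StringStore, hash_string
+
+    stringstore = StringStore()
+    key = stringstore.add(u'hello')
+    assert key == hash_string(u'hello')
+    assert stringstore[key] == u'hello'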
+
++h(3, "stringstore_attributes") Attributes
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code mem]
+        +cell #[code cymem.Pool]
+        +cell
+            | A memory pool. Allocated memory will be freed once the
+            | #[code StringStore] object is garbage collected.
+
+    +row
+        +cell #[code keys]
+        +cell #[+abbr("vector[uint64_t]") #[code vector[hash_t]]]
+        +cell A list of hash values in the #[code StringStore].
diff --git a/website/api/_cython/_token.jade b/website/api/_cython/_token.jade
new file mode 100644
index 000000000..1ed2bc552
--- /dev/null
+++ b/website/api/_cython/_token.jade
@@ -0,0 +1,73 @@
+//- πŸ’« DOCS > API > CYTHON > CLASSES > TOKEN
+
+p
+    | A Cython class providing access and methods for a
+    | #[+api("cython-structs#tokenc") #[code TokenC]] struct. Note that the
+    | #[code Token] object does not own the struct. It only receives a pointer
+    | to it.
+
++infobox
+    | This section documents the extra C-level attributes and methods that
+    | can't be accessed from Python. For the Python documentation, see
+    | #[+api("token") #[code Token]].
+
++h(3, "token_attributes") Attributes
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code vocab]
+        +cell #[code Vocab]
+        +cell A reference to the shared #[code Vocab] object.
+
+    +row
+        +cell #[code c]
+        +cell #[code TokenC*]
+        +cell
+            | A pointer to a #[+api("cython-structs#tokenc") #[code TokenC]]
+            | struct.
+
+    +row
+        +cell #[code i]
+        +cell #[code int]
+        +cell The offset of the token within the document.
+
+    +row
+        +cell #[code doc]
+        +cell #[code Doc]
+        +cell The parent document.
+
++h(3, "token_cinit") Token.cinit
+    +tag method
+
+p Create a #[code Token] object from a #[code TokenC*] pointer.
+
++aside-code("Example").
+    token = Token.cinit(doc.vocab, &doc.c[3], 3, doc)
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code vocab]
+        +cell #[code Vocab]
+        +cell A reference to the shared #[code Vocab].
+
+    +row
+        +cell #[code c]
+        +cell #[code TokenC*]
+        +cell
+            | A pointer to a #[+api("cython-structs#tokenc") #[code TokenC]]
+            | struct.
+
+    +row
+        +cell #[code offset]
+        +cell #[code int]
+        +cell The offset of the token within the document.
+
+    +row
+        +cell #[code doc]
+        +cell #[code Doc]
+        +cell The parent document.
+
+    +row("foot")
+        +cell returns
+        +cell #[code Token]
+        +cell The newly constructed object.
diff --git a/website/api/_cython/_tokenc.jade b/website/api/_cython/_tokenc.jade
new file mode 100644
index 000000000..967012843
--- /dev/null
+++ b/website/api/_cython/_tokenc.jade
@@ -0,0 +1,285 @@
+//- πŸ’« DOCS > API > CYTHON > STRUCTS > TOKENC
+
+p
+    | Cython data container for the #[code Token] object.
+
++aside-code("Example").
+    token_ptr = &doc.c[3]
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code lex]
+        +cell #[code const LexemeC*]
+        +cell A pointer to the lexeme for the token.
+
+    +row
+        +cell #[code morph]
+        +cell #[code uint64_t]
+        +cell An ID allowing lookup of morphological attributes.
+
+    +row
+        +cell #[code pos]
+        +cell #[code univ_pos_t]
+        +cell Coarse-grained part-of-speech tag.
+
+    +row
+        +cell #[code spacy]
+        +cell #[code bint]
+        +cell A binary value indicating whether the token has trailing whitespace.
+
+    +row
+        +cell #[code tag]
+        +cell #[+abbr("uint64_t") #[code attr_t]]
+        +cell Fine-grained part-of-speech tag.
+
+    +row
+        +cell #[code idx]
+        +cell #[code int]
+        +cell The character offset of the token within the parent document.
+
+    +row
+        +cell #[code lemma]
+        +cell #[+abbr("uint64_t") #[code attr_t]]
+        +cell Base form of the token, with no inflectional suffixes.
+
+    +row
+        +cell #[code sense]
+        +cell #[+abbr("uint64_t") #[code attr_t]]
+        +cell Space for storing a word sense ID, currently unused.
+
+    +row
+        +cell #[code head]
+        +cell #[code int]
+        +cell Offset of the syntactic parent relative to the token.
+
+    +row
+        +cell #[code dep]
+        +cell #[+abbr("uint64_t") #[code attr_t]]
+        +cell Syntactic dependency relation.
+
+    +row
+        +cell #[code l_kids]
+        +cell #[code uint32_t]
+        +cell Number of left children.
+
+    +row
+        +cell #[code r_kids]
+        +cell #[code uint32_t]
+        +cell Number of right children.
+
+    +row
+        +cell #[code l_edge]
+        +cell #[code uint32_t]
+        +cell Offset of the leftmost token of this token's syntactic descendants.
+
+    +row
+        +cell #[code r_edge]
+        +cell #[code uint32_t]
+        +cell Offset of the rightmost token of this token's syntactic descendants.
+
+    +row
+        +cell #[code sent_start]
+        +cell #[code int]
+        +cell
+            | Ternary value indicating whether the token is the first word of
+            | a sentence. #[code 0] indicates a missing value, #[code -1]
+            | indicates #[code False] and #[code 1] indicates #[code True].
+            | The default value, #[code 0], is interpreted as no sentence
+            | break. Sentence boundary detectors will usually set #[code 0]
+            | for all tokens except those that follow a sentence boundary.
+
+    +row
+        +cell #[code ent_iob]
+        +cell #[code int]
+        +cell
+            | IOB code of named entity tag. #[code 0] indicates a missing
+            | value, #[code 1] indicates #[code I], #[code 2] indicates
+            | #[code O] and #[code 3] indicates #[code B].
+
+    +row
+        +cell #[code ent_type]
+        +cell #[+abbr("uint64_t") #[code attr_t]]
+        +cell Named entity type.
+
+    +row
+        +cell #[code ent_id]
+        +cell #[+abbr("uint64_t") #[code hash_t]]
+        +cell
+            | ID of the entity the token is an instance of, if any. Currently
+            | unused, but potentially useful for coreference resolution.
+
++h(3, "token_get_struct_attr", "spacy/tokens/token.pxd") Token.get_struct_attr
+    +tag staticmethod
+    +tag nogil
+
+p Get the value of an attribute from the #[code TokenC] struct by attribute ID.
+
++aside-code("Example").
+    from spacy.attrs cimport IS_ALPHA
+    from spacy.tokens cimport Token
+
+    is_alpha = Token.get_struct_attr(&doc.c[3], IS_ALPHA)
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code token]
+        +cell #[code const TokenC*]
+        +cell A pointer to a #[code TokenC] struct.
+
+    +row
+        +cell #[code feat_name]
+        +cell #[code attr_id_t]
+        +cell
+            | The ID of the attribute to look up. The attributes are
+            | enumerated in #[code spacy.attrs].
+
+    +row("foot")
+        +cell returns
+        +cell #[+abbr("uint64_t") #[code attr_t]]
+        +cell The value of the attribute.
+
++h(3, "token_set_struct_attr", "spacy/tokens/token.pxd") Token.set_struct_attr
+    +tag staticmethod
+    +tag nogil
+
+p Set the value of an attribute of the #[code TokenC] struct by attribute ID.
+
++aside-code("Example").
+    from spacy.attrs cimport TAG
+    from spacy.tokens cimport Token
+
+    token = &doc.c[3]
+    Token.set_struct_attr(token, TAG, 0)
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code token]
+        +cell #[code const TokenC*]
+        +cell A pointer to a #[code TokenC] struct.
+
+    +row
+        +cell #[code feat_name]
+        +cell #[code attr_id_t]
+        +cell
+            | The ID of the attribute to look up. The attributes are
+            | enumerated in #[code spacy.attrs].
+
+    +row
+        +cell #[code value]
+        +cell #[+abbr("uint64_t") #[code attr_t]]
+        +cell The value to set.
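+
+p
+    | Because both static methods are #[code nogil], they can be combined in
+    | C-speed loops. A minimal sketch, assuming a cimported #[code Doc] named
+    | #[code doc]; the lemma back-off logic is illustrative only:
+
++aside-code("Example").
+    from spacy.attrs cimport LEMMA, ORTH
+    from spacy.tokens cimport Token
+
+    cdef int i
+    for i in range(doc.length):
+        # fall back to the verbatim text where no lemma is set
+        if Token.get_struct_attr(&doc.c[i], LEMMA) == 0:
+            Token.set_struct_attr(&doc.c[i], LEMMA,
+                Token.get_struct_attr(&doc.c[i], ORTH))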
+
++h(3, "token_by_start", "spacy/tokens/doc.pxd") token_by_start
+    +tag function
+
+p Find a token in a #[code TokenC*] array by the offset of its first character.
+
++aside-code("Example").
+    from spacy.tokens.doc cimport Doc, token_by_start
+    from spacy.vocab cimport Vocab
+
+    doc = Doc(Vocab(), words=[u'hello', u'world'])
+    assert token_by_start(doc.c, doc.length, 6) == 1
+    assert token_by_start(doc.c, doc.length, 4) == -1
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code tokens]
+        +cell #[code const TokenC*]
+        +cell A #[code TokenC*] array.
+
+    +row
+        +cell #[code length]
+        +cell #[code int]
+        +cell The number of tokens in the array.
+
+    +row
+        +cell #[code start_char]
+        +cell #[code int]
+        +cell The start index to search for.
+
+    +row("foot")
+        +cell returns
+        +cell #[code int]
+        +cell The index of the token in the array or #[code -1] if not found.
+
++h(3, "token_by_end", "spacy/tokens/doc.pxd") token_by_end
+    +tag function
+
+p Find a token in a #[code TokenC*] array by the offset of its final character.
+
++aside-code("Example").
+    from spacy.tokens.doc cimport Doc, token_by_end
+    from spacy.vocab cimport Vocab
+
+    doc = Doc(Vocab(), words=[u'hello', u'world'])
+    assert token_by_end(doc.c, doc.length, 5) == 0
+    assert token_by_end(doc.c, doc.length, 1) == -1
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code tokens]
+        +cell #[code const TokenC*]
+        +cell A #[code TokenC*] array.
+
+    +row
+        +cell #[code length]
+        +cell #[code int]
+        +cell The number of tokens in the array.
+
+    +row
+        +cell #[code end_char]
+        +cell #[code int]
+        +cell The end index to search for.
+
+    +row("foot")
+        +cell returns
+        +cell #[code int]
+        +cell The index of the token in the array or #[code -1] if not found.
+
++h(3, "set_children_from_heads", "spacy/tokens/doc.pxd") set_children_from_heads
+    +tag function
+
+p
+    | Set attributes that allow lookup of syntactic children on a
+    | #[code TokenC*] array. This function must be called after making changes
+    | to the #[code TokenC.head] attribute, in order to make the parse tree
+    | navigation consistent. Note that #[code head] stores the #[em relative]
+    | offset of the parent, not its absolute index.
+
++aside-code("Example").
+    from spacy.tokens.doc cimport Doc, set_children_from_heads
+    from spacy.vocab cimport Vocab
+
+    doc = Doc(Vocab(), words=[u'Baileys', u'from', u'a', u'shoe'])
+    doc.c[0].head = 0   # Baileys is the root
+    doc.c[1].head = -1  # from --> Baileys
+    doc.c[2].head = 1   # a --> shoe
+    doc.c[3].head = -2  # shoe --> from
+    set_children_from_heads(doc.c, doc.length)
+    assert doc.c[3].l_kids == 1
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code tokens]
+        +cell #[code const TokenC*]
+        +cell A #[code TokenC*] array.
+
+    +row
+        +cell #[code length]
+        +cell #[code int]
+        +cell The number of tokens in the array.
diff --git a/website/api/_cython/_vocab.jade b/website/api/_cython/_vocab.jade
new file mode 100644
index 000000000..b0b975577
--- /dev/null
+++ b/website/api/_cython/_vocab.jade
@@ -0,0 +1,88 @@
+//- πŸ’« DOCS > API > CYTHON > CLASSES > VOCAB
+
+p
+    | A Cython class providing access and methods for a vocabulary and other
+    | data shared across a language.
+
++infobox
+    | This section documents the extra C-level attributes and methods that
+    | can't be accessed from Python. For the Python documentation, see
+    | #[+api("vocab") #[code Vocab]].
+
++h(3, "vocab_attributes") Attributes
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code mem]
+        +cell #[code cymem.Pool]
+        +cell
+            | A memory pool. Allocated memory will be freed once the
+            | #[code Vocab] object is garbage collected.
+
+    +row
+        +cell #[code strings]
+        +cell #[code StringStore]
+        +cell
+            | A #[code StringStore] that maps string to hash values and vice
+            | versa.
+
+    +row
+        +cell #[code length]
+        +cell #[code int]
+        +cell The number of entries in the vocabulary.
+
++h(3, "vocab_get") Vocab.get
+    +tag method
+
+p
+    | Retrieve a #[+api("cython-structs#lexemec") #[code LexemeC*]] pointer
+    | from the vocabulary.
+
++aside-code("Example").
+    lexeme = vocab.get(vocab.mem, u'hello')
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code mem]
+        +cell #[code cymem.Pool]
+        +cell
+            | A memory pool. Allocated memory will be freed once the
+            | #[code Vocab] object is garbage collected.
+
+    +row
+        +cell #[code string]
+        +cell #[code unicode]
+        +cell The string of the word to look up.
+
+    +row("foot")
+        +cell returns
+        +cell #[code const LexemeC*]
+        +cell The lexeme in the vocabulary.
+
++h(3, "vocab_get_by_orth") Vocab.get_by_orth
+    +tag method
+
+p
+    | Retrieve a #[+api("cython-structs#lexemec") #[code LexemeC*]] pointer
+    | from the vocabulary.
+
++aside-code("Example").
+    lexeme = vocab.get_by_orth(vocab.mem, doc[0].lex.norm)
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code mem]
+        +cell #[code cymem.Pool]
+        +cell
+            | A memory pool. Allocated memory will be freed once the
+            | #[code Vocab] object is garbage collected.
+
+    +row
+        +cell #[code orth]
+        +cell #[+abbr("uint64_t") #[code attr_t]]
+        +cell ID of the verbatim text content.
+
+    +row("foot")
+        +cell returns
+        +cell #[code const LexemeC*]
+        +cell The lexeme in the vocabulary.
diff --git a/website/api/_data.json b/website/api/_data.json
index 67b9debf0..6e269f074 100644
--- a/website/api/_data.json
+++ b/website/api/_data.json
@@ -33,6 +33,12 @@
         "Vectors": "vectors",
         "GoldParse": "goldparse",
         "GoldCorpus": "goldcorpus"
+    },
+
+    "Cython": {
+        "Architecture": "cython",
+        "Structs": "cython-structs",
+        "Classes": "cython-classes"
     }
 },
 
@@ -41,8 +47,7 @@ "index": {
     "title": "Facts & Figures",
     "next": "annotation",
     "menu": {
         "Basics": "basics",
-        "Neural Network Model": "nn-model",
-        "Cython Conventions": "cython"
+        "Neural Network Model": "nn-model"
     }
 },
 
@@ -211,5 +216,36 @@
         "Named Entities": "named-entities",
         "Models & Training": "training"
     }
+    },
+
+    "cython": {
+        "title": "Cython Architecture",
+        "next": "cython-structs",
+        "menu": {
+            "Overview": "overview",
+            "Conventions": "conventions"
+        }
+    },
+
+    "cython-structs": {
+        "title": "Cython Structs",
+        "teaser": "C-language objects that let you group variables together in a single contiguous block.",
+        "next": "cython-classes",
+        "menu": {
+            "TokenC": "tokenc",
+            "LexemeC": "lexemec"
+        }
+    },
+
+    "cython-classes": {
+        "title": "Cython Classes",
+        "menu": {
+            "Doc": "doc",
+            "Token": "token",
+            "Span": "span",
+            "Lexeme": "lexeme",
+            "Vocab": "vocab",
+            "StringStore": "stringstore"
+        }
     }
 }
diff --git a/website/api/cython-classes.jade b/website/api/cython-classes.jade
new file mode 100644
index 000000000..9d7d8cc53
--- /dev/null
+++ b/website/api/cython-classes.jade
@@ -0,0 +1,39 @@
+//- πŸ’« DOCS > API > CYTHON > CLASSES
+
+include ../_includes/_mixins
+
++section("doc")
+    +h(2, "doc", "spacy/tokens/doc.pxd") Doc
+        +tag cdef class
+
+    include _cython/_doc
+
++section("token")
+    +h(2, "token", "spacy/tokens/token.pxd") Token
+        +tag cdef class
+
+    include _cython/_token
+
++section("span")
+    +h(2, "span", "spacy/tokens/span.pxd") Span
+        +tag cdef class
+
+    include _cython/_span
+
++section("lexeme")
+    +h(2, "lexeme", "spacy/lexeme.pxd") Lexeme
+        +tag cdef class
+
+    include _cython/_lexeme
+
++section("vocab")
+    +h(2, "vocab", "spacy/vocab.pxd") Vocab
+        +tag cdef class
+
+    include _cython/_vocab
+
++section("stringstore")
+    +h(2, "stringstore", "spacy/strings.pxd") StringStore
+        +tag cdef class
+
+    include _cython/_stringstore
diff --git a/website/api/cython-structs.jade b/website/api/cython-structs.jade
new file mode 100644
index 000000000..bdaf01ac4
--- /dev/null
+++ b/website/api/cython-structs.jade
@@ -0,0 +1,15 @@
+//- πŸ’« DOCS > API > CYTHON > STRUCTS
+
+include ../_includes/_mixins
+
++section("tokenc")
+    +h(2, "tokenc", "spacy/structs.pxd") TokenC
+        +tag C struct
+
+    include _cython/_tokenc
+
++section("lexemec")
+    +h(2, "lexemec", "spacy/structs.pxd") LexemeC
+        +tag C struct
+
+    include _cython/_lexemec
diff --git a/website/api/cython.jade b/website/api/cython.jade
new file mode 100644
index 000000000..5a1853046
--- /dev/null
+++ b/website/api/cython.jade
@@ -0,0 +1,216 @@
+//- πŸ’« DOCS > API > CYTHON > ARCHITECTURE
+
+include ../_includes/_mixins
+
++section("overview")
+    +aside("What's Cython?")
+        | #[+a("http://cython.org/") Cython] is a language for writing
+        | C extensions for Python. Most Python code is also valid Cython, but
+        | you can add type declarations to get efficient memory-managed code
+        | just like C or C++.
+
+    p
+        | This section documents spaCy's C-level data structures and
+        | interfaces, intended for use from Cython. Some of the attributes are
+        | primarily for internal use, and all C-level functions and methods are
+        | designed for speed over safety – if you make a mistake and access an
+        | array out-of-bounds, the program may crash abruptly.
+
+    p
+        | With Cython there are four ways of declaring complex data types.
+        | Unfortunately we use all four in different places, as they all have
+        | different utility:
+
+    +table(["Declaration", "Description", "Example"])
+        +row
+            +cell #[code class]
+            +cell A normal Python class.
+            +cell #[+api("language") #[code Language]]
+
+        +row
+            +cell #[code cdef class]
+            +cell
+                | A Python extension type. Differs from a normal Python class
+                | in that its attributes can be defined on the underlying
+                | struct. Can have C-level objects as attributes (notably
+                | structs and pointers), and can have methods which have
+                | C-level objects as arguments or return types.
+            +cell #[+api("cython-classes#lexeme") #[code Lexeme]]
+
+        +row
+            +cell #[code cdef struct]
+            +cell
+                | A struct is just a collection of variables, sort of like a
+                | named tuple, except the memory is contiguous. Structs can't
+                | have methods, only attributes.
+            +cell #[+api("cython-structs#lexemec") #[code LexemeC]]
+
+        +row
+            +cell #[code cdef cppclass]
+            +cell
+                | A C++ class. Like a struct, this can be allocated on the
+                | stack, but can have methods, a constructor and a destructor.
+                | Differs from #[code cdef class] in that it can be created and
+                | destroyed without acquiring the Python global interpreter
+                | lock. This style is the most obscure.
+            +cell #[+src(gh("spacy", "spacy/syntax/_state.pxd")) #[code StateC]]
+
+    p
+        | The most important classes in spaCy are defined as #[code cdef class]
+        | objects. The underlying data for these objects is usually gathered
+        | into a struct, which is usually named #[code c]. For instance, the
+        | #[+api("cython-classes#lexeme") #[code Lexeme]] class holds a
+        | #[+api("cython-structs#lexemec") #[code LexemeC]] struct, at
+        | #[code Lexeme.c]. This lets you shed the Python container, and pass
+        | a pointer to the underlying data into C-level functions.
+ ++section("conventions") + +h(2, "conventions") Conventions + + p + | spaCy's core data structures are implemented as + | #[+a("http://cython.org/") Cython] #[code cdef] classes. Memory is + | managed through the #[+a(gh("cymem")) #[code cymem]] + | #[code cymem.Pool] class, which allows you + | to allocate memory which will be freed when the #[code Pool] object + | is garbage collected. This means you usually don't have to worry + | about freeing memory. You just have to decide which Python object + | owns the memory, and make it own the #[code Pool]. When that object + | goes out of scope, the memory will be freed. You do have to take + | care that no pointers outlive the object that owns them β€” but this + | is generally quite easy. + + p + | All Cython modules should have the #[code # cython: infer_types=True] + | compiler directive at the top of the file. This makes the code much + | cleaner, as it avoids the need for many type declarations. If + | possible, you should prefer to declare your functions #[code nogil], + | even if you don't especially care about multi-threading. The reason + | is that #[code nogil] functions help the Cython compiler reason about + | your code quite a lot β€” you're telling the compiler that no Python + | dynamics are possible. This lets many errors be raised, and ensures + | your function will run at C speed. + + + p + | Cython gives you many choices of sequences: you could have a Python + | list, a numpy array, a memory view, a C++ vector, or a pointer. + | Pointers are preferred, because they are fastest, have the most + | explicit semantics, and let the compiler check your code more + | strictly. C++ vectors are also great β€” but you should only use them + | internally in functions. It's less friendly to accept a vector as an + | argument, because that asks the user to do much more work. Here's + | how to get a pointer from a numpy array, memory view or vector: + + +code. + cdef void get_pointers(np.ndarray[int, mode='c'] numpy_array, vector[int] cpp_vector, int[::1] memory_view) nogil: + pointer1 = <int*>numpy_array.data + pointer2 = cpp_vector.data() + pointer3 = &memory_view[0] + + p + | Both C arrays and C++ vectors reassure the compiler that no Python + | operations are possible on your variable. This is a big advantage: + | it lets the Cython compiler raise many more errors for you. + + p + | When getting a pointer from a numpy array or memoryview, take care + | that the data is actually stored in C-contiguous order β€” otherwise + | you'll get a pointer to nonsense. The type-declarations in the code + | above should generate runtime errors if buffers with incorrect + | memory layouts are passed in. To iterate over the array, the + | following style is preferred: + + +code. + cdef int c_total(const int* int_array, int length) nogil: + total = 0 + for item in int_array[:length]: + total += item + return total + + p + | If this is confusing, consider that the compiler couldn't deal with + | #[code for item in int_array:] β€” there's no length attached to a raw + | pointer, so how could we figure out where to stop? The length is + | provided in the slice notation as a solution to this. Note that we + | don't have to declare the type of #[code item] in the code above β€” + | the compiler can easily infer it. This gives us tidy code that looks + | quite like Python, but is exactly as fast as C β€” because we've made + | sure the compilation to C is trivial. 
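+
+    p
+        | As a minimal sketch of the #[code Pool] ownership pattern described
+        | at the start of this section (the #[code Counts] class is
+        | illustrative, not part of spaCy):
+
+    +code.
+        from cymem.cymem cimport Pool
+
+        cdef class Counts:
+            cdef Pool mem
+            cdef int* counts
+
+            def __init__(self, int size):
+                self.mem = Pool()
+                # freed automatically when the Counts object is
+                # garbage collected
+                self.counts = <int*>self.mem.alloc(size, sizeof(int))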
+
+    p
+        | Your functions cannot be declared #[code nogil] if they need to
+        | create Python objects or call Python functions. This is perfectly
+        | okay β€” you shouldn't torture your code just to get #[code nogil]
+        | functions. However, if your function isn't #[code nogil], you should
+        | compile your module with #[code cython -a --cplus my_module.pyx] and
+        | open the resulting #[code my_module.html] file in a browser. This
+        | will let you see how Cython is compiling your code. Calls into the
+        | Python run-time will be in bright yellow. This lets you easily see
+        | whether Cython is able to correctly type your code, or whether there
+        | are unexpected problems.
+
+    p
+        | Working in Cython is very rewarding once you're over the initial
+        | learning curve. As with C and C++, the first way you write something
+        | in Cython will often be the performance-optimal approach. In
+        | contrast, Python optimisation generally requires a lot of
+        | experimentation. Is it faster to have an #[code if item in my_dict]
+        | check, or to use #[code .get()]? What about
+        | #[code try]/#[code except]? Does this numpy operation create a copy?
+        | There's no way to guess the answers to these questions, and you'll
+        | usually be dissatisfied with your results β€” so there's no way to
+        | know when to stop this process. In the worst case, you'll make a
+        | mess that invites the next reader to try their luck too. This is
+        | like one of those
+        | #[+a("http://www.wemjournal.org/article/S1080-6032%2809%2970088-2/abstract") volcanic gas-traps],
+        | where the rescuers keep passing out from low oxygen, causing
+        | another rescuer to follow β€” only to succumb themselves. In short,
+        | just say no to optimizing your Python. If it's not fast enough the
+        | first time, just switch to Cython.
+
+    +infobox("Resources")
+        +list.o-no-block
+            +item #[+a("http://docs.cython.org/en/latest/") Official Cython documentation] (cython.org)
+            +item #[+a("https://explosion.ai/blog/writing-c-in-cython", true) Writing C in Cython] (explosion.ai)
+            +item #[+a("https://explosion.ai/blog/multithreading-with-cython") Multi-threading spaCy’s parser and named entity recogniser] (explosion.ai)
diff --git a/website/api/index.jade b/website/api/index.jade
index 8035c9ff5..441d185e5 100644
--- a/website/api/index.jade
+++ b/website/api/index.jade
@@ -7,8 +7,151 @@ include ../_includes/_mixins
 
 +section("nn-model")
     +h(2, "nn-model") Neural network model architecture
-    include _architecture/_nn-model
 
-+section("cython")
-    +h(2, "cython") Cython conventions
-    include _architecture/_cython
+    p
+        | spaCy's statistical models have been custom-designed to give a
+        | high-performance mix of speed and accuracy. The current architecture
+        | hasn't been published yet, but in the meantime we prepared a video that
+        | explains how the models work, with particular focus on NER.
+
+    +youtube("sqDHBH9IjRU")
+
+    p
+        | The parsing model is a blend of recent results. The two recent
+        | inspirations have been the work of Eliyahu Kiperwasser and Yoav
+        | Goldberg at Bar-Ilan University#[+fn(1)], and the SyntaxNet team from
+        | Google. The foundation of the parser is still based on the work of
+        | Joakim Nivre#[+fn(2)], who introduced the transition-based
+        | framework#[+fn(3)], the arc-eager transition system, and the
+        | imitation learning objective. The model is implemented using
+        | #[+a(gh("thinc")) Thinc], spaCy's machine learning library. We first
+        | predict context-sensitive vectors for each word in the input:
+
+    +code.
+        (embed_lower | embed_prefix | embed_suffix | embed_shape)
+        >> Maxout(token_width)
+        >> convolution ** 4
+
+    p
+        | This convolutional layer is shared between the tagger, parser and NER,
+        | and will also be shared by the future neural lemmatizer. Because the
+        | parser shares these layers with the tagger, the parser does not require
+        | tag features. I got this trick from David Weiss's "Stack Combination"
+        | paper#[+fn(4)].
+
+    p
+        | To boost the representation, the tagger actually predicts a "super tag"
+        | with POS, morphology and dependency label#[+fn(5)]. The tagger predicts
+        | these supertags by adding a softmax layer onto the convolutional layer –
+        | so, we're teaching the convolutional layer to give us a representation
+        | that's one affine transform from this informative lexical information.
+        | This is obviously good for the parser (which backprops to the
+        | convolutions too). The parser model makes a state vector by concatenating
+        | the vector representations for its context tokens. The current context
+        | tokens:
+
+    +table
+        +row
+            +cell #[code S0], #[code S1], #[code S2]
+            +cell Top three words on the stack.
+
+        +row
+            +cell #[code B0], #[code B1]
+            +cell First two words of the buffer.
+
+        +row
+            +cell
+                | #[code S0L1], #[code S1L1], #[code S2L1], #[code B0L1],
+                | #[code B1L1]#[br]
+                | #[code S0L2], #[code S1L2], #[code S2L2], #[code B0L2],
+                | #[code B1L2]
+            +cell
+                | Leftmost and second leftmost children of #[code S0], #[code S1],
+                | #[code S2], #[code B0] and #[code B1].
+
+        +row
+            +cell
+                | #[code S0R1], #[code S1R1], #[code S2R1], #[code B0R1],
+                | #[code B1R1]#[br]
+                | #[code S0R2], #[code S1R2], #[code S2R2], #[code B0R2],
+                | #[code B1R2]
+            +cell
+                | Rightmost and second rightmost children of #[code S0], #[code S1],
+                | #[code S2], #[code B0] and #[code B1].
+
+    p
+        | This makes the state vector quite long: #[code 13*T], where #[code T] is
+        | the token vector width (128 is working well). Fortunately, there's a way
+        | to structure the computation to save some expense (and make it more
+        | GPU-friendly).
+
+    p
+        | The parser typically visits #[code 2*N] states for a sentence of length
+        | #[code N] (although it may visit more, if it back-tracks with a
+        | non-monotonic transition#[+fn(6)]). A naive implementation would require
+        | #[code 2*N (B, 13*T) @ (13*T, H)] matrix multiplications for a batch of
+        | size #[code B]. We can instead perform one #[code (B*N, T) @ (T, 13*H)]
+        | multiplication, to pre-compute the hidden weights for each positional
+        | feature with respect to the words in the batch. (Note that our token
+        | vectors come from the CNN β€” so we can't play this trick over the
+        | vocabulary. That's how Stanford's NN parser#[+fn(7)] works β€” and why its
+        | model is so big.)
+
+    p
+        | This pre-computation strategy allows a nice compromise between
+        | GPU-friendliness and implementation simplicity. The CNN and the wide
+        | lower layer are computed on the GPU, and then the precomputed hidden
+        | weights are moved to the CPU, before we start the transition-based
+        | parsing process. This makes a lot of things much easier. We don't have to
+        | worry about variable-length batch sizes, and we don't have to implement
+        | the dynamic oracle in CUDA to train.
+
+    p
+        | Currently the parser's loss function is multilabel log loss#[+fn(8)], as
+        | the dynamic oracle allows multiple states to be 0 cost. This is defined
+        | as follows, where #[code gZ] is the sum of the scores assigned to gold
+        | classes:
+
+    +code.
+        (exp(score) / Z) - (exp(score) / gZ)
+
+    +bibliography
+        +item
+            | #[+a("https://www.semanticscholar.org/paper/Simple-and-Accurate-Dependency-Parsing-Using-Bidir-Kiperwasser-Goldberg/3cf31ecb2724b5088783d7c96a5fc0d5604cbf41") Simple and Accurate Dependency Parsing Using Bidirectional LSTM Feature Representations]
+            br
+            | Eliyahu Kiperwasser, Yoav Goldberg. (2016)
+
+        +item
+            | #[+a("https://www.semanticscholar.org/paper/A-Dynamic-Oracle-for-Arc-Eager-Dependency-Parsing-Goldberg-Nivre/22697256ec19ecc3e14fcfc63624a44cf9c22df4") A Dynamic Oracle for Arc-Eager Dependency Parsing]
+            br
+            | Yoav Goldberg, Joakim Nivre (2012)
+
+        +item
+            | #[+a("https://explosion.ai/blog/parsing-english-in-python") Parsing English in 500 Lines of Python]
+            br
+            | Matthew Honnibal (2013)
+
+        +item
+            | #[+a("https://www.semanticscholar.org/paper/Stack-propagation-Improved-Representation-Learning-Zhang-Weiss/0c133f79b23e8c680891d2e49a66f0e3d37f1466") Stack-propagation: Improved Representation Learning for Syntax]
+            br
+            | Yuan Zhang, David Weiss (2016)
+
+        +item
+            | #[+a("https://www.semanticscholar.org/paper/Deep-multi-task-learning-with-low-level-tasks-supe-S%C3%B8gaard-Goldberg/03ad06583c9721855ccd82c3d969a01360218d86") Deep multi-task learning with low level tasks supervised at lower layers]
+            br
+            | Anders SΓΈgaard, Yoav Goldberg (2016)
+
+        +item
+            | #[+a("https://www.semanticscholar.org/paper/An-Improved-Non-monotonic-Transition-System-for-De-Honnibal-Johnson/4094cee47ade13b77b5ab4d2e6cb9dd2b8a2917c") An Improved Non-monotonic Transition System for Dependency Parsing]
+            br
+            | Matthew Honnibal, Mark Johnson (2015)
+
+        +item
+            | #[+a("http://cs.stanford.edu/people/danqi/papers/emnlp2014.pdf") A Fast and Accurate Dependency Parser using Neural Networks]
+            br
+            | Danqi Chen, Christopher D. Manning (2014)
+
+        +item
+            | #[+a("https://www.semanticscholar.org/paper/Parsing-the-Wall-Street-Journal-using-a-Lexical-Fu-Riezler-King/0ad07862a91cd59b7eb5de38267e47725a62b8b2") Parsing the Wall Street Journal using a Lexical-Functional Grammar and Discriminative Estimation Techniques]
+            br
+            | Stefan Riezler et al. (2002)
diff --git a/website/api/token.jade b/website/api/token.jade
index ca237acc6..924336481 100644
--- a/website/api/token.jade
+++ b/website/api/token.jade
@@ -573,15 +573,15 @@ p The L2 norm of the token's vector representation.
         +cell #[code ent_id]
         +cell int
         +cell
-            | ID of the entity the token is an instance of, if any. Usually
-            | assigned by patterns in the Matcher.
+            | ID of the entity the token is an instance of, if any. Currently
+            | unused, but potentially useful for coreference resolution.
 
     +row
         +cell #[code ent_id_]
         +cell unicode
         +cell
-            | ID of the entity the token is an instance of, if any. Usually
-            | assigned by patterns in the Matcher.
+            | ID of the entity the token is an instance of, if any. Currently
+            | unused, but potentially useful for coreference resolution.
+row
    +cell #[code lemma]
diff --git a/website/assets/css/_base/_objects.sass b/website/assets/css/_base/_objects.sass
index f54667c3f..ef463870e 100644
--- a/website/assets/css/_base/_objects.sass
+++ b/website/assets/css/_base/_objects.sass
@@ -231,3 +231,19 @@
     border: none
     text-align-last: center
     width: 100%
+
+//- Abbreviations
+
+.o-abbr
+    +breakpoint(min, md)
+        cursor: help
+        border-bottom: 2px dotted $color-theme
+        padding-bottom: 3px
+
+    +breakpoint(max, sm)
+        &[data-tooltip]:before
+            content: none
+
+        &:after
+            content: " (" attr(aria-label) ")"
+            color: $color-subtle-dark
diff --git a/website/assets/js/main.js b/website/assets/js/main.js
index 3295dddd6..2ae1a4691 100644
--- a/website/assets/js/main.js
+++ b/website/assets/js/main.js
@@ -58,7 +58,12 @@ import initUniverse from './universe.vue.js';
     const sectionAttr = 'data-section';
     const navAttr = 'data-nav';
     const activeClass = 'is-active';
+    const sidebarAttr = 'data-sidebar-active';
     const sections = [...document.querySelectorAll(`[${navAttr}]`)];
+    const currentItem = document.querySelector(`[${sidebarAttr}]`);
+    if (currentItem && Element.prototype.scrollIntoView) {
+        currentItem.scrollIntoView();
+    }
     if (window.inView) {
         if (sections.length) { // highlight first item regardless
             sections[0].classList.add(activeClass);
@@ -69,6 +74,9 @@ import initUniverse from './universe.vue.js';
             if (el) {
                 sections.forEach(el => el.classList.remove(activeClass));
                 el.classList.add(activeClass);
+                if (Element.prototype.scrollIntoView) {
+                    el.scrollIntoView();
+                }
             }
         });
     }
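The two main.js hunks above add scroll syncing on top of the existing in-view highlighting: on page load, the sidebar link carrying the new data-sidebar-active attribute is scrolled into view, and as sections enter the viewport, the matching nav item is both highlighted and scrolled into view. Each call site feature-checks Element.prototype.scrollIntoView first, so older browsers simply skip the scrolling. A minimal standalone sketch of the same pattern, assuming the in-view.js library (window.inView) is loaded; the wiring between a section and its nav link via section.id is an assumption for illustration, not shown in this hunk:

    // Keep the current page's sidebar link visible on load.
    const current = document.querySelector('[data-sidebar-active]');
    if (current && Element.prototype.scrollIntoView) {
        current.scrollIntoView();
    }

    // Highlight and reveal the nav item for whichever section scrolls into view.
    const navItems = [...document.querySelectorAll('[data-nav]')];
    if (window.inView && navItems.length) {
        navItems[0].classList.add('is-active'); // highlight first item regardless
        inView('[data-section]').on('enter', section => {
            // Assumption: each nav link's data-nav value matches its section's id.
            const el = document.querySelector(`[data-nav="${section.id}"]`);
            if (el) {
                navItems.forEach(item => item.classList.remove('is-active'));
                el.classList.add('is-active');
                // Guard: older browsers may lack scrollIntoView on Element.
                if (Element.prototype.scrollIntoView) el.scrollIntoView();
            }
        });
    }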
diff --git a/website/assets/js/vendor/prism.min.js b/website/assets/js/vendor/prism.min.js
index d5f80f603..00fc29aab 100644
--- a/website/assets/js/vendor/prism.min.js
+++ b/website/assets/js/vendor/prism.min.js
@@ -16,7 +16,7 @@ Prism.languages.json={property:/".*?"(?=\s*:)/gi,string:/"(?!:)(\\?[^"])*?"(?!:)
 !function(a){var e=/\\([^a-z()[\]]|[a-z\*]+)/i,n={"equation-command":{pattern:e,alias:"regex"}};a.languages.latex={comment:/%.*/m,cdata:{pattern:/(\\begin\{((?:verbatim|lstlisting)\*?)\})([\w\W]*?)(?=\\end\{\2\})/,lookbehind:!0},equation:[{pattern:/\$(?:\\?[\w\W])*?\$|\\\((?:\\?[\w\W])*?\\\)|\\\[(?:\\?[\w\W])*?\\\]/,inside:n,alias:"string"},{pattern:/(\\begin\{((?:equation|math|eqnarray|align|multline|gather)\*?)\})([\w\W]*?)(?=\\end\{\2\})/,lookbehind:!0,inside:n,alias:"string"}],keyword:{pattern:/(\\(?:begin|end|ref|cite|label|usepackage|documentclass)(?:\[[^\]]+\])?\{)[^}]+(?=\})/,lookbehind:!0},url:{pattern:/(\\url\{)[^}]+(?=\})/,lookbehind:!0},headline:{pattern:/(\\(?:part|chapter|section|subsection|frametitle|subsubsection|paragraph|subparagraph|subsubparagraph|subsubsubparagraph)\*?(?:\[[^\]]+\])?\{)[^}]+(?=\}(?:\[[^\]]+\])?)/,lookbehind:!0,alias:"class-name"},"function":{pattern:e,alias:"selector"},punctuation:/[[\]{}&]/}}(Prism);
 Prism.languages.makefile={comment:{pattern:/(^|[^\\])#(?:\\(?:\r\n|[\s\S])|.)*/,lookbehind:!0},string:/(["'])(?:\\(?:\r\n|[\s\S])|(?!\1)[^\\\r\n])*\1/,builtin:/\.[A-Z][^:#=\s]+(?=\s*:(?!=))/,symbol:{pattern:/^[^:=\r\n]+(?=\s*:(?!=))/m,inside:{variable:/\$+(?:[^(){}:#=\s]+|(?=[({]))/}},variable:/\$+(?:[^(){}:#=\s]+|\([@*%<^+?][DF]\)|(?=[({]))/,keyword:[/-include\b|\b(?:define|else|endef|endif|export|ifn?def|ifn?eq|include|override|private|sinclude|undefine|unexport|vpath)\b/,{pattern:/(\()(?:addsuffix|abspath|and|basename|call|dir|error|eval|file|filter(?:-out)?|findstring|firstword|flavor|foreach|guile|if|info|join|lastword|load|notdir|or|origin|patsubst|realpath|shell|sort|strip|subst|suffix|value|warning|wildcard|word(?:s|list)?)(?=[ \t])/,lookbehind:!0}],operator:/(?:::|[?:+!])?=|[|@]/,punctuation:/[:;(){}]/};
 Prism.languages.markdown=Prism.languages.extend("markup",{}),Prism.languages.insertBefore("markdown","prolog",{blockquote:{pattern:/^>(?:[\t ]*>)*/m,alias:"punctuation"},code:[{pattern:/^(?: {4}|\t).+/m,alias:"keyword"},{pattern:/``.+?``|`[^`\n]+`/,alias:"keyword"}],title:[{pattern:/\w+.*(?:\r?\n|\r)(?:==+|--+)/,alias:"important",inside:{punctuation:/==+$|--+$/}},{pattern:/(^\s*)#+.+/m,lookbehind:!0,alias:"important",inside:{punctuation:/^#+|#+$/}}],hr:{pattern:/(^\s*)([*-])([\t ]*\2){2,}(?=\s*$)/m,lookbehind:!0,alias:"punctuation"},list:{pattern:/(^\s*)(?:[*+-]|\d+\.)(?=[\t ].)/m,lookbehind:!0,alias:"punctuation"},"url-reference":{pattern:/!?\[[^\]]+\]:[\t ]+(?:\S+|<(?:\\.|[^>\\])+>)(?:[\t ]+(?:"(?:\\.|[^"\\])*"|'(?:\\.|[^'\\])*'|\((?:\\.|[^)\\])*\)))?/,inside:{variable:{pattern:/^(!?\[)[^\]]+/,lookbehind:!0},string:/(?:"(?:\\.|[^"\\])*"|'(?:\\.|[^'\\])*'|\((?:\\.|[^)\\])*\))$/,punctuation:/^[\[\]!:]|[<>]/},alias:"url"},bold:{pattern:/(^|[^\\])(\*\*|__)(?:(?:\r?\n|\r)(?!\r?\n|\r)|.)+?\2/,lookbehind:!0,inside:{punctuation:/^\*\*|^__|\*\*$|__$/}},italic:{pattern:/(^|[^\\])([*_])(?:(?:\r?\n|\r)(?!\r?\n|\r)|.)+?\2/,lookbehind:!0,inside:{punctuation:/^[*_]|[*_]$/}},url:{pattern:/!?\[[^\]]+\](?:\([^\s)]+(?:[\t ]+"(?:\\.|[^"\\])*")?\)| ?\[[^\]\n]*\])/,inside:{variable:{pattern:/(!?\[)[^\]]+(?=\]$)/,lookbehind:!0},string:{pattern:/"(?:\\.|[^"\\])*"(?=\)$)/}}}}),Prism.languages.markdown.bold.inside.url=Prism.util.clone(Prism.languages.markdown.url),Prism.languages.markdown.italic.inside.url=Prism.util.clone(Prism.languages.markdown.url),Prism.languages.markdown.bold.inside.italic=Prism.util.clone(Prism.languages.markdown.italic),Prism.languages.markdown.italic.inside.bold=Prism.util.clone(Prism.languages.markdown.bold);
-Prism.languages.python={"triple-quoted-string":{pattern:/"""[\s\S]+?"""|'''[\s\S]+?'''/,alias:"string"},comment:{pattern:/(^|[^\\])#.*/,lookbehind:!0},string:/("|')(?:\\?.)*?\1/,"function":{pattern:/((?:^|\s)def[ \t]+)[a-zA-Z_][a-zA-Z0-9_]*(?=\()/g,lookbehind:!0},"class-name":{pattern:/(\bclass\s+)[a-z0-9_]+/i,lookbehind:!0},keyword:/\b(?:as|assert|async|await|break|class|continue|def|del|elif|else|except|exec|finally|for|from|global|if|import|in|is|lambda|pass|print|raise|return|try|while|with|yield)\b/,"boolean":/\b(?:True|False|None)\b/,number:/\b-?(?:0[bo])?(?:(?:\d|0x[\da-f])[\da-f]*\.?\d*|\.\d+)(?:e[+-]?\d+)?j?\b/i,operator:/[-+%=]=?|!=|\*\*?=?|\/\/?=?|<[<=>]?|>[=>]?|[&|^~]|\b(?:or|and|not)\b/,punctuation:/[{}[\];(),.:]/,"constant":/\b[A-Z_]{2,}\b/};
+Prism.languages.python={"triple-quoted-string":{pattern:/"""[\s\S]+?"""|'''[\s\S]+?'''/,alias:"string"},comment:{pattern:/(^|[^\\])#.*/,lookbehind:!0},string:/("|')(?:\\?.)*?\1/,"function":{pattern:/((?:^|\s)def[ \t]+)[a-zA-Z_][a-zA-Z0-9_]*(?=\()/g,lookbehind:!0},"class-name":{pattern:/(\bclass\s+)[a-z0-9_]+/i,lookbehind:!0},keyword:/\b(?:as|assert|async|await|break|class|continue|def|del|elif|else|except|exec|finally|for|from|global|if|import|in|is|lambda|pass|print|raise|return|try|while|with|yield|cimport)\b/,"boolean":/\b(?:True|False|None)\b/,number:/\b-?(?:0[bo])?(?:(?:\d|0x[\da-f])[\da-f]*\.?\d*|\.\d+)(?:e[+-]?\d+)?j?\b/i,operator:/[-+%=]=?|!=|\*\*?=?|\/\/?=?|<[<=>]?|>[=>]?|[&|^~]|\b(?:or|and|not)\b/,punctuation:/[{}[\];(),.:]/,"constant":/\b[A-Z_]{2,}\b/};
 Prism.languages.rest={table:[{pattern:/(\s*)(?:\+[=-]+)+\+(?:\r?\n|\r)(?:\1(?:[+|].+)+[+|](?:\r?\n|\r))+\1(?:\+[=-]+)+\+/,lookbehind:!0,inside:{punctuation:/\||(?:\+[=-]+)+\+/}},{pattern:/(\s*)(?:=+ +)+=+((?:\r?\n|\r)\1.+)+(?:\r?\n|\r)\1(?:=+ +)+=+(?=(?:\r?\n|\r){2}|\s*$)/,lookbehind:!0,inside:{punctuation:/[=-]+/}}],"substitution-def":{pattern:/(^\s*\.\. )\|(?:[^|\s](?:[^|]*[^|\s])?)\| [^:]+::/m,lookbehind:!0,inside:{substitution:{pattern:/^\|(?:[^|\s]|[^|\s][^|]*[^|\s])\|/,alias:"attr-value",inside:{punctuation:/^\||\|$/}},directive:{pattern:/( +)[^:]+::/,lookbehind:!0,alias:"function",inside:{punctuation:/::$/}}}},"link-target":[{pattern:/(^\s*\.\. )\[[^\]]+\]/m,lookbehind:!0,alias:"string",inside:{punctuation:/^\[|\]$/}},{pattern:/(^\s*\.\. )_(?:`[^`]+`|(?:[^:\\]|\\.)+):/m,lookbehind:!0,alias:"string",inside:{punctuation:/^_|:$/}}],directive:{pattern:/(^\s*\.\. )[^:]+::/m,lookbehind:!0,alias:"function",inside:{punctuation:/::$/}},comment:{pattern:/(^\s*\.\.)(?:(?: .+)?(?:(?:\r?\n|\r).+)+| .+)(?=(?:\r?\n|\r){2}|$)/m,lookbehind:!0},title:[{pattern:/^(([!"#$%&'()*+,\-.\/:;<=>?@\[\\\]^_`{|}~])\2+)(?:\r?\n|\r).+(?:\r?\n|\r)\1$/m,inside:{punctuation:/^[!"#$%&'()*+,\-.\/:;<=>?@\[\\\]^_`{|}~]+|[!"#$%&'()*+,\-.\/:;<=>?@\[\\\]^_`{|}~]+$/,important:/.+/}},{pattern:/(^|(?:\r?\n|\r){2}).+(?:\r?\n|\r)([!"#$%&'()*+,\-.\/:;<=>?@\[\\\]^_`{|}~])\2+(?=\r?\n|\r|$)/,lookbehind:!0,inside:{punctuation:/[!"#$%&'()*+,\-.\/:;<=>?@\[\\\]^_`{|}~]+$/,important:/.+/}}],hr:{pattern:/((?:\r?\n|\r){2})([!"#$%&'()*+,\-.\/:;<=>?@\[\\\]^_`{|}~])\2{3,}(?=(?:\r?\n|\r){2})/,lookbehind:!0,alias:"punctuation"},field:{pattern:/(^\s*):[^:\r\n]+:(?= )/m,lookbehind:!0,alias:"attr-name"},"command-line-option":{pattern:/(^\s*)(?:[+-][a-z\d]|(?:\-\-|\/)[a-z\d-]+)(?:[ =](?:[a-z][a-z\d_-]*|<[^<>]+>))?(?:, (?:[+-][a-z\d]|(?:\-\-|\/)[a-z\d-]+)(?:[ =](?:[a-z][a-z\d_-]*|<[^<>]+>))?)*(?=(?:\r?\n|\r)? {2,}\S)/im,lookbehind:!0,alias:"symbol"},"literal-block":{pattern:/::(?:\r?\n|\r){2}([ \t]+).+(?:(?:\r?\n|\r)\1.+)*/,inside:{"literal-block-punctuation":{pattern:/^::/,alias:"punctuation"}}},"quoted-literal-block":{pattern:/::(?:\r?\n|\r){2}([!"#$%&'()*+,\-.\/:;<=>?@\[\\\]^_`{|}~]).*(?:(?:\r?\n|\r)\1.*)*/,inside:{"literal-block-punctuation":{pattern:/^(?:::|([!"#$%&'()*+,\-.\/:;<=>?@\[\\\]^_`{|}~])\1*)/m,alias:"punctuation"}}},"list-bullet":{pattern:/(^\s*)(?:[*+\-‒‣⁃]|\(?(?:\d+|[a-z]|[ivxdclm]+)\)|(?:\d+|[a-z]|[ivxdclm]+)\.)(?= )/im,lookbehind:!0,alias:"punctuation"},"doctest-block":{pattern:/(^\s*)>>> .+(?:(?:\r?\n|\r).+)*/m,lookbehind:!0,inside:{punctuation:/^>>>/}},inline:[{pattern:/(^|[\s\-:\/'"<(\[{])(?::[^:]+:`.*?`|`.*?`:[^:]+:|(\*\*?|``?|\|)(?!\s).*?[^\s]\2(?=[\s\-.,:;!?\\\/'")\]}]|$))/m,lookbehind:!0,inside:{bold:{pattern:/(^\*\*).+(?=\*\*$)/,lookbehind:!0},italic:{pattern:/(^\*).+(?=\*$)/,lookbehind:!0},"inline-literal":{pattern:/(^``).+(?=``$)/,lookbehind:!0,alias:"symbol"},role:{pattern:/^:[^:]+:|:[^:]+:$/,alias:"function",inside:{punctuation:/^:|:$/}},"interpreted-text":{pattern:/(^`).+(?=`$)/,lookbehind:!0,alias:"attr-value"},substitution:{pattern:/(^\|).+(?=\|$)/,lookbehind:!0,alias:"attr-value"},punctuation:/\*\*?|``?|\|/}}],link:[{pattern:/\[[^\]]+\]_(?=[\s\-.,:;!?\\\/'")\]}]|$)/,alias:"string",inside:{punctuation:/^\[|\]_$/}},{pattern:/(?:\b[a-z\d](?:[_.:+]?[a-z\d]+)*_?_|`[^`]+`_?_|_`[^`]+`)(?=[\s\-.,:;!?\\\/'")\]}]|$)/i,alias:"string",inside:{punctuation:/^_?`|`$|`?_?_$/}}],punctuation:{pattern:/(^\s*)(?:\|(?= |$)|(?:---?|β€”|\.\.|__)(?= )|\.\.$)/m,lookbehind:!0}};
 !function(e){e.languages.sass=e.languages.extend("css",{comment:{pattern:/^([ \t]*)\/[\/*].*(?:(?:\r?\n|\r)\1[ \t]+.+)*/m,lookbehind:!0}}),e.languages.insertBefore("sass","atrule",{"atrule-line":{pattern:/^(?:[ \t]*)[@+=].+/m,inside:{atrule:/(?:@[\w-]+|[+=])/m}}}),delete e.languages.sass.atrule;var a=/((\$[-_\w]+)|(#\{\$[-_\w]+\}))/i,t=[/[+*\/%]|[=!]=|<=?|>=?|\b(?:and|or|not)\b/,{pattern:/(\s+)-(?=\s)/,lookbehind:!0}];e.languages.insertBefore("sass","property",{"variable-line":{pattern:/^[ \t]*\$.+/m,inside:{punctuation:/:/,variable:a,operator:t}},"property-line":{pattern:/^[ \t]*(?:[^:\s]+ *:.*|:[^:\s]+.*)/m,inside:{property:[/[^:\s]+(?=\s*:)/,{pattern:/(:)[^:\s]+/,lookbehind:!0}],punctuation:/:/,variable:a,operator:t,important:e.languages.sass.important}}}),delete e.languages.sass.property,delete e.languages.sass.important,delete e.languages.sass.selector,e.languages.insertBefore("sass","punctuation",{selector:{pattern:/([ \t]*)\S(?:,?[^,\r\n]+)*(?:,(?:\r?\n|\r)\1[ \t]+\S(?:,?[^,\r\n]+)*)*/,lookbehind:!0}})}(Prism);
 Prism.languages.scss=Prism.languages.extend("css",{comment:{pattern:/(^|[^\\])(?:\/\*[\w\W]*?\*\/|\/\/.*)/,lookbehind:!0},atrule:{pattern:/@[\w-]+(?:\([^()]+\)|[^(])*?(?=\s+[{;])/,inside:{rule:/@[\w-]+/}},url:/(?:[-a-z]+-)*url(?=\()/i,selector:{pattern:/(?=\S)[^@;\{\}\(\)]?([^@;\{\}\(\)]|&|#\{\$[-_\w]+\})+(?=\s*\{(\}|\s|[^\}]+(:|\{)[^\}]+))/m,inside:{placeholder:/%[-_\w]+/}}}),Prism.languages.insertBefore("scss","atrule",{keyword:[/@(?:if|else(?: if)?|for|each|while|import|extend|debug|warn|mixin|include|function|return|content)/i,{pattern:/( +)(?:from|through)(?= )/,lookbehind:!0}]}),Prism.languages.insertBefore("scss","property",{variable:/\$[-_\w]+|#\{\$[-_\w]+\}/}),Prism.languages.insertBefore("scss","function",{placeholder:{pattern:/%[-_\w]+/,alias:"selector"},statement:/\B!(?:default|optional)\b/i,"boolean":/\b(?:true|false)\b/,"null":/\bnull\b/,operator:{pattern:/(\s)(?:[-+*\/%]|[=!]=|<=?|>=?|and|or|not)(?=\s)/,lookbehind:!0}}),Prism.languages.scss.atrule.inside.rest=Prism.util.clone(Prism.languages.scss);
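The only functional change to the vendored Prism bundle is a single word: cimport is appended to the keyword alternation of the Python grammar, so Cython import statements in highlighted code samples render as keywords rather than plain text. Patching a minified vendor file is easy to lose on the next upgrade; an alternative would be to extend the grammar at runtime through Prism's insertBefore hook, which this same bundle already uses for markdown, rest and scss. A sketch of that approach follows; note that the patch itself only adds cimport, and the extra Cython keywords here (cdef, cpdef, ctypedef, nogil) are illustrative assumptions:

    // Extend Prism's Python grammar at runtime instead of editing prism.min.js.
    // insertBefore(language, beforeToken, newTokens) splices the new tokens
    // into the grammar ahead of the existing "keyword" token so they match first.
    Prism.languages.insertBefore('python', 'keyword', {
        'cython-keyword': {
            // Only cimport comes from this patch; the rest are hypothetical extras.
            pattern: /\b(?:cimport|cdef|cpdef|ctypedef|nogil)\b/,
            alias: 'keyword' // reuse the existing keyword styling
        }
    });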