Document regex utilities [ci skip]

Ines Montani 2019-02-24 18:34:10 +01:00
parent cd4bc6757b
commit 1ea1bc98e7
2 changed files with 66 additions and 0 deletions

@@ -315,6 +315,11 @@ def read_regex(path):
def compile_prefix_regex(entries):
    """Compile a list of prefix rules into a regex object.

    entries (tuple): The prefix rules, e.g. spacy.lang.punctuation.TOKENIZER_PREFIXES.
    RETURNS (regex object): The regex object to be used for Tokenizer.prefix_search.
    """
    if "(" in entries:
        # Handle deprecated data
        expression = "|".join(
            ["^" + re.escape(piece) for piece in entries if piece.strip()]
        )
        return re.compile(expression)
    else:
        expression = "|".join(["^" + piece for piece in entries if piece.strip()])
        return re.compile(expression)
@@ -327,11 +332,21 @@ def compile_prefix_regex(entries):
def compile_suffix_regex(entries):
    """Compile a list of suffix rules into a regex object.

    entries (tuple): The suffix rules, e.g. spacy.lang.punctuation.TOKENIZER_SUFFIXES.
    RETURNS (regex object): The regex object to be used for Tokenizer.suffix_search.
    """
    expression = "|".join([piece + "$" for piece in entries if piece.strip()])
    return re.compile(expression)


def compile_infix_regex(entries):
    """Compile a list of infix rules into a regex object.

    entries (tuple): The infix rules, e.g. spacy.lang.punctuation.TOKENIZER_INFIXES.
    RETURNS (regex object): The regex object to be used for Tokenizer.infix_finditer.
    """
    expression = "|".join([piece for piece in entries if piece.strip()])
    return re.compile(expression)
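
All three helpers also filter out empty or whitespace-only rules before joining the pattern. A minimal sketch of that behavior (assuming spaCy is installed, so `spacy.util` is importable):

```python
from spacy import util

# Whitespace-only entries fail the `piece.strip()` check above, so they
# never end up in the compiled pattern.
suffix_regex = util.compile_suffix_regex(("%", "", "  "))
assert suffix_regex.pattern == "%$"
```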

@@ -504,6 +504,57 @@ an error if key doesn't match `ORTH` values.
| `*addition_dicts` | dicts | Exception dictionaries to add to the base exceptions, in order. |
| **RETURNS** | dict | Combined tokenizer exceptions. |

### util.compile_prefix_regex {#util.compile_prefix_regex tag="function"}

Compile a sequence of prefix rules into a regex object.

> #### Example
>
> ```python
> prefixes = ("§", "%", "=", r"\+")
> prefix_regex = util.compile_prefix_regex(prefixes)
> nlp.tokenizer.prefix_search = prefix_regex.search
> ```

| Name        | Type                                                          | Description                                                                                                                                |
| ----------- | ------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `entries`   | tuple                                                         | The prefix rules, e.g. [`lang.punctuation.TOKENIZER_PREFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py).  |
| **RETURNS** | [regex](https://docs.python.org/3/library/re.html#re-objects) | The regex object to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes).                                                    |
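
As the source shows, each prefix rule is joined with a leading `^`, so the compiled object only matches at the start of a string. A small illustrative sketch, reusing the prefixes from the example above:

```python
from spacy import util

prefixes = ("§", "%", "=", r"\+")
prefix_regex = util.compile_prefix_regex(prefixes)

# Anchored with "^": prefixes match only at the start of the string.
assert prefix_regex.search("%10").group() == "%"
assert prefix_regex.search("10%") is None
```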

### util.compile_suffix_regex {#util.compile_suffix_regex tag="function"}

Compile a sequence of suffix rules into a regex object.

> #### Example
>
> ```python
> suffixes = ("'s", "'S", r"(?<=[0-9])\+")
> suffix_regex = util.compile_suffix_regex(suffixes)
> nlp.tokenizer.suffix_search = suffix_regex.search
> ```

| Name        | Type                                                          | Description                                                                                                                                |
| ----------- | ------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `entries`   | tuple                                                         | The suffix rules, e.g. [`lang.punctuation.TOKENIZER_SUFFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py).  |
| **RETURNS** | [regex](https://docs.python.org/3/library/re.html#re-objects) | The regex object to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes).                                                    |
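
Suffix rules are anchored with a trailing `$` instead, so matches are only reported at the end of a string. A small sketch reusing the suffixes from the example above:

```python
from spacy import util

suffixes = ("'s", "'S", r"(?<=[0-9])\+")
suffix_regex = util.compile_suffix_regex(suffixes)

# Anchored with "$": suffixes match only at the end of the string.
assert suffix_regex.search("Alice's").group() == "'s"
assert suffix_regex.search("'s easy") is None
```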

### util.compile_infix_regex {#util.compile_infix_regex tag="function"}

Compile a sequence of infix rules into a regex object.

> #### Example
>
> ```python
> infixes = ("…", "-", "—", r"(?<=[0-9])[+\-\*^](?=[0-9-])")
> infix_regex = util.compile_infix_regex(infixes)
> nlp.tokenizer.infix_finditer = infix_regex.finditer
> ```

| Name        | Type                                                          | Description                                                                                                                              |
| ----------- | ------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------- |
| `entries`   | tuple                                                         | The infix rules, e.g. [`lang.punctuation.TOKENIZER_INFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py).  |
| **RETURNS** | [regex](https://docs.python.org/3/library/re.html#re-objects) | The regex object to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes).                                                 |
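
Infix rules are joined without anchors and consumed via `finditer`, so every in-string match is reported along with its offsets. A small sketch reusing the infixes from the example above:

```python
from spacy import util

infixes = ("…", "-", "—", r"(?<=[0-9])[+\-\*^](?=[0-9-])")
infix_regex = util.compile_infix_regex(infixes)

# finditer yields every match with its position in the string.
matches = [(m.start(), m.group()) for m in infix_regex.finditer("3+4-5")]
assert matches == [(1, "+"), (3, "-")]
```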

### util.minibatch {#util.minibatch tag="function" new="2"}

Iterate over batches of items. `size` may be an iterator, so that batch-size can