mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	DocBin: add version number, missing attributes and strings (#5685)
* Add version number to DocBin: add a version number to DocBin for future use.
* Add POS to all attributes in DocBin
* Add morph string to strings in DocBin
* Update DocBin API
* Add string for ENT_KB_ID in DocBin
This commit is contained in:
		
							parent
							
								
									b5268955d7
								
							
						
					
					
						commit
						a723fa02a1
					
				| 
						 | 
					@ -9,7 +9,7 @@ from ..attrs import SPACY, ORTH, intify_attr
 | 
				
			||||||
from ..errors import Errors
 | 
					from ..errors import Errors
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH")
 | 
					ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class DocBin(object):
 | 
					class DocBin(object):
 | 
				
			||||||
| 
						 | 
					@ -31,6 +31,7 @@ class DocBin(object):
 | 
				
			||||||
        "spaces": bytes, # Serialized numpy boolean array with spaces data
 | 
					        "spaces": bytes, # Serialized numpy boolean array with spaces data
 | 
				
			||||||
        "lengths": bytes, # Serialized numpy int32 array with the doc lengths
 | 
					        "lengths": bytes, # Serialized numpy int32 array with the doc lengths
 | 
				
			||||||
        "strings": List[unicode] # List of unique strings in the token data
 | 
					        "strings": List[unicode], # List of unique strings in the token data
 | 
				
			||||||
 | 
					        "version": str, # DocBin version number
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    Strings for the words, tags, labels etc are represented by 64-bit hashes in
 | 
					    Strings for the words, tags, labels etc are represented by 64-bit hashes in
 | 
				
			||||||
| 
						 | 
					@ -53,6 +54,7 @@ class DocBin(object):
 | 
				
			||||||
        DOCS: https://spacy.io/api/docbin#init
 | 
					        DOCS: https://spacy.io/api/docbin#init
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        attrs = sorted([intify_attr(attr) for attr in attrs])
 | 
					        attrs = sorted([intify_attr(attr) for attr in attrs])
 | 
				
			||||||
 | 
					        self.version = "0.1"
 | 
				
			||||||
        self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
 | 
					        self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
 | 
				
			||||||
        self.attrs.insert(0, ORTH)  # Ensure ORTH is always attrs[0]
 | 
					        self.attrs.insert(0, ORTH)  # Ensure ORTH is always attrs[0]
 | 
				
			||||||
        self.tokens = []
 | 
					        self.tokens = []
 | 
				
			||||||
| 
						 | 
					@ -87,8 +89,10 @@ class DocBin(object):
 | 
				
			||||||
            self.strings.add(token.text)
 | 
					            self.strings.add(token.text)
 | 
				
			||||||
            self.strings.add(token.tag_)
 | 
					            self.strings.add(token.tag_)
 | 
				
			||||||
            self.strings.add(token.lemma_)
 | 
					            self.strings.add(token.lemma_)
 | 
				
			||||||
 | 
					            self.strings.add(token.morph_)
 | 
				
			||||||
            self.strings.add(token.dep_)
 | 
					            self.strings.add(token.dep_)
 | 
				
			||||||
            self.strings.add(token.ent_type_)
 | 
					            self.strings.add(token.ent_type_)
 | 
				
			||||||
 | 
					            self.strings.add(token.ent_kb_id_)
 | 
				
			||||||
        self.cats.append(doc.cats)
 | 
					        self.cats.append(doc.cats)
 | 
				
			||||||
        if self.store_user_data:
 | 
					        if self.store_user_data:
 | 
				
			||||||
            self.user_data.append(srsly.msgpack_dumps(doc.user_data))
 | 
					            self.user_data.append(srsly.msgpack_dumps(doc.user_data))
 | 
				
			||||||
| 
						 | 
					@ -147,6 +151,7 @@ class DocBin(object):
 | 
				
			||||||
        spaces = numpy.vstack(self.spaces) if self.spaces else numpy.asarray([])
 | 
					        spaces = numpy.vstack(self.spaces) if self.spaces else numpy.asarray([])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        msg = {
 | 
					        msg = {
 | 
				
			||||||
 | 
					            "version": self.version,
 | 
				
			||||||
            "attrs": self.attrs,
 | 
					            "attrs": self.attrs,
 | 
				
			||||||
            "tokens": tokens.tobytes("C"),
 | 
					            "tokens": tokens.tobytes("C"),
 | 
				
			||||||
            "spaces": spaces.tobytes("C"),
 | 
					            "spaces": spaces.tobytes("C"),
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -16,8 +16,9 @@ document from the `DocBin`. The serialization format is gzipped msgpack, where
 | 
				
			||||||
the msgpack object has the following structure:
 | 
					the msgpack object has the following structure:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
```python
 | 
					```python
 | 
				
			||||||
### msgpack object strcutrue
 | 
					### msgpack object structure
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
 | 
					    "version": str,           # DocBin version number
 | 
				
			||||||
    "attrs": List[uint64],    # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE]
 | 
					    "attrs": List[uint64],    # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE]
 | 
				
			||||||
    "tokens": bytes,          # Serialized numpy uint64 array with the token data
 | 
					    "tokens": bytes,          # Serialized numpy uint64 array with the token data
 | 
				
			||||||
    "spaces": bytes,          # Serialized numpy boolean array with spaces data
 | 
					    "spaces": bytes,          # Serialized numpy boolean array with spaces data
 | 
				
			||||||
| 
						 | 
					@ -45,7 +46,7 @@ Create a `DocBin` object to hold serialized annotations.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| Argument          | Type     | Description                                                                                                                                                                                |
 | 
					| Argument          | Type     | Description                                                                                                                                                                                |
 | 
				
			||||||
| ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | 
					| ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | 
				
			||||||
| `attrs`           | list     | List of attributes to serialize. `orth` (hash of token text) and `spacy` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `None`. |
 | 
					| `attrs`           | list     | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. |
 | 
				
			||||||
| `store_user_data` | bool     | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`.                                                                                 |
 | 
					| `store_user_data` | bool     | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`.                                                                                 |
 | 
				
			||||||
| **RETURNS**       | `DocBin` | The newly constructed object.                                                                                                                                                              |
 | 
					| **RETURNS**       | `DocBin` | The newly constructed object.                                                                                                                                                              |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user