Customization
While we've chosen sensible defaults, you can add custom processing functions to the DocumentParser
class to further process the extracted data.
from openparse import processing, Node
from typing import List
class CustomCombineTables(processing.ProcessingStep):
"""
Let's combine tables that are next to each other
"""
def process(self, nodes: List[Node]) -> List[Node]:
new_nodes = []
print("Combining concurrent tables")
for i in range(len(nodes) - 1):
if "table" in nodes[i].variant and "table" in nodes[i + 1].variant:
new_node = nodes[i] + nodes[i + 1]
new_nodes.append(new_node)
else:
new_nodes.append(nodes[i])
return new_nodes
# add a custom processing step to the pipeline
custom_pipeline = processing.BasicIngestionPipeline()
custom_pipeline.append_transform(CustomCombineTables())
parser = openparse.DocumentParser(
table_args={"parsing_algorithm": "pymupdf"}, processing_pipeline=custom_pipeline
)
custom_10k = parser.parse(meta10k_path)
Or you can create your own custom processing pipeline from scratch by extending the IngestionPipeline
class.
from openparse import processing, Node
from typing import List
class BasicIngestionPipeline(processing.IngestionPipeline):
"""
A basic pipeline for ingesting and processing Nodes.
"""
def __init__(self):
self.transformations = [
processing.RemoveTextInsideTables(),
processing.RemoveFullPageStubs(max_area_pct=0.35),
]