# pdf-table-extractor/pdf_table_extractor.py
#
# 218 lines
# 7.6 KiB
# Python

import sys
import traceback
from typing import Dict, List, Optional, Sequence, Tuple, TypeAlias
import PIL.Image
import gi
gi.require_version("Gtk", "3.0")
from gi.repository import GLib, Gio, Gtk, GObject
gi.require_foreign("cairo")
import cairo
import fitz
# A 2-D point expressed as an (x, y) pair of floats.
Coords: TypeAlias = Tuple[float, float]
class Selection:
    """A rectangular region on a page, optionally split into columns.

    ``bounds`` holds two (x, y) corner points and ``columns`` holds the
    positions of the column separators. The constructor does not interpret
    either value; it only stores them, copying ``columns`` into a fresh list.
    """

    def __init__(self, bounds: Tuple[Coords, Coords], columns: Optional[Sequence[float]] = None):
        # A missing or empty sequence becomes a new, empty list so the
        # instance never shares storage with the caller.
        separator_positions: list[float] = list(columns) if columns else []
        self.columns = separator_positions
        self.bounds = bounds
class Page:
    """A single PDF page plus the selections that have been placed on it.

    Attributes are plain data: ``index`` is the position given by the
    creator, ``raw`` is the underlying PyMuPDF page object and
    ``selections`` starts out empty.
    """

    def __init__(self, index: int, raw: fitz.Page):
        # Every page gets its own list; nothing is shared between pages.
        self.selections: list[Selection] = []
        self.raw = raw
        self.index = index
class Document:
    """An opened PDF file and a ``Page`` wrapper for each of its pages.

    Opening happens in the constructor, so any error raised by PyMuPDF for
    the given file propagates to the caller.
    """

    def __init__(self, filename: str):
        self.filename = filename
        self.raw = fitz.Document(filename)
        # Wrap the pages in document order, numbering them from zero.
        self.pages = [
            Page(index=number, raw=raw_page)
            for number, raw_page in enumerate(self.raw.pages())  # type: ignore
        ]
class PdfPage(Gtk.DrawingArea):
    """Drawing area that shows one PDF page with its selections overlaid.

    The page is rasterised with PyMuPDF and painted through cairo on every
    ``draw`` signal. Each selection stored on the page is then drawn on top
    as a dashed black-and-white rectangle with dashed red column lines.
    """

    # Resolution used when the caller does not request a specific one.
    DEFAULT_DPI = 50

    def __init__(self, page, *args, dpi: int = DEFAULT_DPI, **kwargs):
        """Create the widget for ``page``.

        Args:
            page: The ``Page`` to display.
            *args: Forwarded to ``Gtk.DrawingArea``.
            dpi: Resolution used to rasterise the page. The same value is
                used for the size request and for painting, so the two
                cannot drift apart.
            **kwargs: Forwarded to ``Gtk.DrawingArea``.
        """
        super().__init__(*args, **kwargs)
        self.page: Page = page
        self.dpi = dpi
        # Render once up front only to learn the pixel size of the page.
        pix = self._render_pixmap()
        self.set_size_request(pix.width, pix.height)
        self.connect("draw", self.on_draw, {})
        # Hard-coded sample selection that is attached to the page whose
        # index is 4.
        if self.page.index == 4:
            self.page.selections.append(Selection(
                ((0.1, 0.18), (0.9, 0.72)),
                columns=[0.08, 0.24, 0.34, 0.42, 0.51, 0.59, 0.67, 0.73, 0.91]
            ))

    def _render_pixmap(self) -> fitz.Pixmap:
        """Rasterise the page at this widget's resolution."""
        return self.page.raw.get_pixmap(dpi=self.dpi)  # type: ignore

    def on_draw(self, widget: Gtk.DrawingArea, cr: cairo.Context, data: GObject.GPointer):
        """Handle the ``draw`` signal: background, page image, selections.

        Args:
            widget: The drawing area being painted.
            cr: The cairo context to paint with.
            data: The user data passed when the handler was connected.
        """
        width = widget.get_allocated_width()
        height = widget.get_allocated_height()
        sctx = widget.get_style_context()
        Gtk.render_background(sctx, cr, 0, 0, width, height)

        pix = self._render_pixmap()
        img = PIL.Image.frombytes("RGBA" if pix.alpha else "RGB", [pix.width, pix.height], pix.samples)
        # Guarantee a fourth channel so every pixel occupies four bytes.
        img.putalpha(1)
        # Swap the red and blue channels before handing the bytes to cairo.
        img = PIL.Image.merge("RGBA", (lambda r, g, b, a: (b, g, r, a))(*img.split()))  # type: ignore
        # cairo needs a writable buffer, hence the bytearray copy.
        mv: memoryview = memoryview(bytearray(img.tobytes()))
        ims = cairo.ImageSurface.create_for_data(mv, cairo.Format.RGB24, pix.width, pix.height)
        cr.set_source_surface(ims, 0, 0)
        cr.paint()

        for sel in self.page.selections:
            self._draw_selection(cr, sel, pix.width, pix.height)

    def _draw_selection(self, cr: cairo.Context, sel: Selection, page_width: int, page_height: int) -> None:
        """Draw one selection rectangle and its column lines.

        The selection's bounds are multiplied by ``page_width`` and
        ``page_height``; each column value is multiplied by the width of the
        resulting rectangle and offset from its left edge.
        """
        (left, top), (right, bottom) = sel.bounds
        sel_x1 = left * page_width
        sel_y1 = top * page_height
        sel_x2 = right * page_width
        sel_y2 = bottom * page_height

        # Base settings
        cr.set_line_cap(cairo.LINE_CAP_BUTT)
        cr.set_line_width(2)

        # Columns (draw first - below selection)
        cr.set_dash([5])
        cr.set_source_rgba(1, 0, 0)
        for col in sel.columns:
            col_x = sel_x1 + (sel_x2 - sel_x1) * col
            cr.move_to(col_x, sel_y1)
            cr.line_to(col_x, sel_y2)
            cr.stroke()

        # Selection
        cr.rectangle(sel_x1, sel_y1, sel_x2 - sel_x1, sel_y2 - sel_y1)
        # White part of the pattern (dash offset shifts it by one dash)
        cr.set_source_rgba(1, 1, 1)
        cr.set_dash([5], 5)
        cr.stroke_preserve()  # important preserve - reuse rectangle
        # Black part of the pattern
        cr.set_source_rgba(0, 0, 0)
        cr.set_dash([5])
        cr.stroke()
@Gtk.Template.from_file("MainWindow.glade")
class MainWindow(Gtk.ApplicationWindow):
    """Main application window, built from the ``MainWindow.glade`` template.

    The window listens for changes of the application's ``document``
    property and rebuilds the page list whenever a new document is set.
    """

    __gtype_name__ = "main_window"

    # Widgets bound from the template.
    open_button: Gtk.Button = Gtk.Template.Child()  # type: ignore
    header_bar: Gtk.HeaderBar = Gtk.Template.Child()  # type: ignore
    main_paned: Gtk.Paned = Gtk.Template.Child()  # type: ignore
    pdf_list_box: Gtk.ListBox = Gtk.Template.Child()  # type: ignore
    pdfFileFilter: Gtk.FileFilter = Gtk.Template.Child()  # type: ignore

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.app: Application = self.get_application()  # type: ignore
        assert self.app is not None
        self.app.connect("notify::document", self.on_document_updated)

    # @Gtk.Template.Callback()
    # def example_button_released_cb(self, widget: Gtk.Button, **kwargs):
    #     assert self.example_button == widget
    #     print(widget.get_label())
    #     widget.set_label("woah")

    def _load_document(self, filename: str) -> None:
        """Open ``filename`` and publish it as the application's document.

        Any exception raised while opening is reported to the user in a
        modal error dialog (with the traceback as secondary text) instead of
        propagating out of the signal handler.
        """
        try:
            self.app.set_property("document", Document(filename))
        except Exception as e:
            message_dialog = Gtk.MessageDialog(
                title="An error has occurred.",
                transient_for=self,
                modal=True,
                message_type=Gtk.MessageType.ERROR,
                text=repr(e),
                secondary_text=traceback.format_exc(),
                buttons=Gtk.ButtonsType.OK,
            )
            try:
                message_dialog.run()  # type: ignore
            finally:
                message_dialog.destroy()

    @Gtk.Template.Callback()
    def on_open_button_clicked(self, widget, *args, **kwargs):
        """Let the user pick a PDF file and load it."""
        dialog = Gtk.FileChooserDialog(
            title="Choose PDF File to open",
            transient_for=self,  # equivalent to parent=
            action=Gtk.FileChooserAction.OPEN,
            filter=self.pdfFileFilter,
            modal=True,
        )
        dialog.add_button("Cancel", Gtk.ResponseType.CANCEL)
        dialog.add_button("Open", Gtk.ResponseType.ACCEPT)

        filename: Optional[str] = None
        try:
            if dialog.run() == Gtk.ResponseType.ACCEPT:  # type: ignore
                filename = dialog.get_filename()  # type: ignore
        finally:
            # Close the chooser before loading so it is never leaked and
            # never stays open underneath an error dialog.
            dialog.destroy()

        # get_filename() may return None; nothing to load in that case.
        if filename is not None:
            self._load_document(filename)

    # Note: this won't run unless a new document object is put into place
    # editing an existing one won't trigger it
    def on_document_updated(self, recvobj, gparamstring):
        """Refresh the header bar and page list for the current document."""
        import os.path

        document: Document = self.app.get_property("document")
        # basename() honours the platform's path separator.
        self.header_bar.set_title(os.path.basename(document.filename))
        self.header_bar.set_subtitle(document.filename)

        # Drop the rows that belonged to the previous document.
        for child in self.pdf_list_box.get_children():
            if isinstance(child, Gtk.ListBoxRow):
                child.destroy()

        for page in document.pages:
            self.pdf_list_box.add(PdfPage(page))
        self.pdf_list_box.show_all()

    @Gtk.Template.Callback()
    def on_open_button_small_clicked(self, widget, **kwargs):
        """Load a fixed test file without showing the file chooser."""
        TEST_FILENAME = "/home/luna/Documents/Resources/Praca Licencjacka/sources/2018_Torres-Benitez_Metabolomic analysis Parmotrema.pdf"
        self._load_document(TEST_FILENAME)
class Application(Gtk.Application):
    """GTK application object that owns the main window and the document.

    ``document`` is a read/write GObject property holding an arbitrary
    Python object, so listeners can subscribe to ``notify::document``.
    """

    document = GObject.Property(
        type=GObject.TYPE_PYOBJECT,
        flags=GObject.ParamFlags.READWRITE,
    )

    def __init__(self, *args, **kwargs):
        super().__init__(
            *args,
            application_id="zone.lunareclipse.pdf_table_extractor",
            flags=Gio.ApplicationFlags.FLAGS_NONE,
            # flags=Gio.ApplicationFlags.HANDLES_COMMAND_LINE, # TODO
            **kwargs
        )
        # Created on first activation.
        self.window = None

    def do_activate(self):
        """Show the main window, creating it the first time around."""
        if not self.window:
            self.window = MainWindow(application=self)
        self.window.show_all()  # type: ignore
if __name__ == "__main__":
    app = Application()
    # run() returns the application's exit status; pass it on to the
    # calling process instead of always exiting with 0.
    sys.exit(app.run(sys.argv))