pdf-table-extractor/pdf_table_extractor.py

164 lines
5.6 KiB
Python
Raw Normal View History

2024-04-27 21:23:40 +00:00
import sys
import traceback
from typing import Dict, List, Optional, Sequence, Tuple, TypeAlias
2024-04-28 13:16:30 +00:00
import PIL.Image
2024-04-27 21:23:40 +00:00
import gi
2024-04-27 21:29:33 +00:00
2024-04-27 21:23:40 +00:00
gi.require_version("Gtk", "3.0")
from gi.repository import GLib, Gio, Gtk, GObject
2024-04-27 21:29:33 +00:00
2024-04-27 21:23:40 +00:00
gi.require_foreign("cairo")
import cairo
import fitz
# Hard-coded absolute path to one specific PDF on the author's machine.
# NOTE(review): nothing in the visible code reads this constant — presumably
# a leftover from manual testing; confirm before relying on it.
TEST_FILENAME = "/home/luna/Documents/Resources/Praca Licencjacka/sources/2018_Torres-Benitez_Metabolomic analysis Parmotrema.pdf"
# A pair of floats; used as the element type of ``Selection.bounds``.
# NOTE(review): presumably an (x, y) point in page space — confirm the
# coordinate system and units against the code that creates selections.
Coords: TypeAlias = Tuple[float, float]
2024-04-27 21:29:33 +00:00
2024-04-27 21:23:40 +00:00
class Selection:
    """A region marked on a page, optionally split into columns.

    Attributes:
        bounds: Two ``Coords`` pairs delimiting the region.
        columns: Optional sequence of floats describing the columns of the
            region, or ``None`` when no columns were given.
    """

    def __init__(self, bounds: Tuple[Coords, Coords], columns: Optional[Sequence[float]] = None):
        """Store *bounds* and *columns* on the instance unchanged."""
        self.bounds, self.columns = bounds, columns
2024-04-27 21:29:33 +00:00
2024-04-27 21:23:40 +00:00
class Document:
    """An opened PDF file together with the selections made on it.

    Attributes:
        filename: The path the document was opened from, as passed in.
        document: The ``fitz.Document`` opened from ``filename``.
        selections: Mapping from an integer key to a list of ``Selection``
            objects; starts out empty.
    """

    def __init__(self, filename: str):
        """Open *filename* with PyMuPDF and start with no selections.

        Any exception raised by ``fitz.Document`` propagates to the caller.
        """
        # NOTE(review): the int key is presumably a page number — confirm
        # against the code that fills this mapping.
        self.selections: Dict[int, List[Selection]] = dict()
        self.document = fitz.Document(filename)
        self.filename = filename
2024-04-27 21:29:33 +00:00
2024-04-27 21:23:40 +00:00
class State:
    """Empty placeholder class; it defines no attributes or methods."""
2024-04-27 21:29:33 +00:00
2024-04-27 21:23:40 +00:00
class PdfPage(Gtk.DrawingArea):
    """Drawing area that paints one PDF page.

    The widget requests exactly the pixel size of the page rendered at
    ``_RENDER_DPI`` and re-renders the page on every ``draw`` signal.
    """

    # Resolution (dots per inch) used both for sizing and for painting, so
    # the requested widget size always matches the painted bitmap.
    _RENDER_DPI = 96

    def __init__(self, page, *args, **kwargs):
        """Create the widget for *page*.

        Args:
            page: The ``fitz.Page`` to display.
            *args, **kwargs: Forwarded to ``Gtk.DrawingArea``.
        """
        super().__init__(*args, **kwargs)
        self.page: fitz.Page = page

        # Render once up front only to learn the pixel dimensions.
        pix = self.page.get_pixmap(dpi=self._RENDER_DPI)  # type: ignore
        self.set_size_request(pix.width, pix.height)
        self.set_app_paintable(True)  # type: ignore
        self.connect("draw", self.on_draw, {})

    def on_draw(self, widget: Gtk.DrawingArea, cr: cairo.Context, data: GObject.GPointer):
        """Handle the ``draw`` signal: paint the background, then the page.

        Args:
            widget: The widget being drawn (this drawing area).
            cr: The cairo context to paint into.
            data: The user data given to ``connect`` (unused).
        """
        width = widget.get_allocated_width()
        height = widget.get_allocated_height()

        sctx = widget.get_style_context()
        Gtk.render_background(sctx, cr, 0, 0, width, height)

        pix = self.page.get_pixmap(dpi=self._RENDER_DPI)  # type: ignore
        pil_mode = "RGBA" if pix.alpha else "RGB"
        img = PIL.Image.frombytes(pil_mode, (pix.width, pix.height), pix.samples)
        if img.mode != "RGB":
            # RGB24 ignores the alpha byte, so drop it before packing.
            img = img.convert("RGB")

        # cairo's RGB24 is a native-endian 32-bit pixel (0x00RRGGBB): in
        # memory that is B,G,R,X on little-endian hosts and X,R,G,B on
        # big-endian ones. Feeding it Pillow's default R,G,B,A byte order
        # swaps the red and blue channels.
        raw_mode = "BGRX" if sys.byteorder == "little" else "XRGB"
        # create_for_data needs a writable buffer, hence the bytearray.
        pixels = memoryview(bytearray(img.tobytes("raw", raw_mode)))

        surface = cairo.ImageSurface.create_for_data(
            pixels, cairo.Format.RGB24, pix.width, pix.height
        )
        cr.set_source_surface(surface, 0, 0)
        cr.paint()
@Gtk.Template.from_file("MainWindow.glade")
class MainWindow(Gtk.ApplicationWindow):
    """Main application window, built from the ``MainWindow.glade`` template.

    It lets the user pick a PDF file, stores the resulting ``Document`` in the
    application's ``document`` property, and shows one ``PdfPage`` row per
    page whenever that property changes.
    """

    __gtype_name__ = "main_window"

    # Widgets bound from the template.
    open_button: Gtk.Button = Gtk.Template.Child()  # type: ignore
    header_bar: Gtk.HeaderBar = Gtk.Template.Child()  # type: ignore
    main_paned: Gtk.Paned = Gtk.Template.Child()  # type: ignore

    pdf_list_box: Gtk.ListBox = Gtk.Template.Child()  # type: ignore

    pdfFileFilter: Gtk.FileFilter = Gtk.Template.Child()  # type: ignore

    def __init__(self, *args, **kwargs):
        """Create the window and subscribe to application document changes.

        The window must be constructed with ``application=...``; the
        application object is kept in ``self.app``.
        """
        super().__init__(*args, **kwargs)
        self.app: Application = self.get_application()  # type: ignore
        assert self.app is not None
        self.app.connect("notify::document", self.on_document_updated)

    @Gtk.Template.Callback()
    def on_open_button_clicked(self, widget, *args, **kwargs):
        """Ask the user for a PDF file and load it as the current document.

        Any exception raised while opening the file is shown in an error
        dialog (with the traceback as secondary text) instead of propagating.
        """
        dialog = Gtk.FileChooserDialog(
            title="Choose PDF File to open",
            transient_for=self,  # equivalent to parent=
            action=Gtk.FileChooserAction.OPEN,
            filter=self.pdfFileFilter,
            modal=True,
        )
        dialog.add_button("Cancel", Gtk.ResponseType.CANCEL)
        dialog.add_button("Open", Gtk.ResponseType.ACCEPT)

        # try/finally guarantees the chooser is destroyed on every path.
        try:
            response = dialog.run()  # type: ignore
            if response != Gtk.ResponseType.ACCEPT:
                return

            filename: str = dialog.get_filename()  # type: ignore
            try:
                self.app.set_property("document", Document(filename))
            except Exception as e:
                # Top-level UI boundary: report the failure to the user.
                message_dialog = Gtk.MessageDialog(
                    title="An error has occured.",
                    transient_for=self,
                    modal=True,
                    message_type=Gtk.MessageType.ERROR,
                    text=repr(e),
                    secondary_text=traceback.format_exc(),
                    buttons=Gtk.ButtonsType.OK,
                )
                try:
                    message_dialog.run()  # type: ignore
                finally:
                    message_dialog.destroy()
        finally:
            dialog.destroy()

    # Note: this won't run unless a new document object is put into place;
    # editing an existing one won't trigger it.
    def on_document_updated(self, recvobj, gparamstring):
        """Rebuild the header and the page list for the current document.

        Connected to ``notify::document`` on the application. Rows that
        belong to a previously shown document are removed first, so the list
        only ever shows the pages of the current document.

        Args:
            recvobj: The object that emitted the notification (unused).
            gparamstring: The property spec passed by the signal (unused).
        """
        document: Optional[Document] = self.app.get_property("document")

        # Drop the rows of whatever document was shown before; without this
        # the new pages would be appended below the old ones.
        for stale_row in self.pdf_list_box.get_children():  # type: ignore
            stale_row.destroy()

        if document is None:
            # Nothing loaded: leave the list empty and clear the header text.
            self.header_bar.set_title(None)  # type: ignore
            self.header_bar.set_subtitle(None)  # type: ignore
            return

        self.header_bar.set_title(document.filename.split("/")[-1])  # type: ignore
        self.header_bar.set_subtitle(document.filename)  # type: ignore

        for pdf_page in document.document.pages():  # type: ignore
            row = Gtk.ListBoxRow()
            row.add(PdfPage(pdf_page))
            self.pdf_list_box.add(row)  # type: ignore
        self.pdf_list_box.show_all()

    @Gtk.Template.Callback()
    def on_open_button_small_clicked(self, widget, **kwargs):
        """Template callback that currently does nothing."""
        pass
class Application(Gtk.Application):
    """The GTK application object.

    It exposes a read/write ``document`` GObject property holding an
    arbitrary Python object, and owns a single ``MainWindow``.
    """

    document = GObject.Property(
        type=GObject.TYPE_PYOBJECT,
        flags=GObject.ParamFlags.READWRITE,
    )

    def __init__(self, *args, **kwargs):
        """Initialise the application with its fixed id and flags.

        Positional and keyword arguments are forwarded to ``Gtk.Application``.
        """
        # TODO: switch flags to Gio.ApplicationFlags.HANDLES_COMMAND_LINE.
        super().__init__(
            *args,
            application_id="zone.lunareclipse.pdf_table_extractor",
            flags=Gio.ApplicationFlags.FLAGS_NONE,
            **kwargs,
        )
        self.window = None

    def do_activate(self):
        """Show the main window, creating it on first activation."""
        main_window = self.window
        if not main_window:
            main_window = MainWindow(application=self)
        self.window = main_window
        main_window.show_all()  # type: ignore
2024-04-27 21:23:40 +00:00
if __name__ == "__main__":
    # Script entry point: build the application and hand it the command line.
    argv = sys.argv
    app = Application()
    app.run(argv)