From 00828d4c084d74e02440e51f58fe3f2ca3162cfe Mon Sep 17 00:00:00 2001 From: LunarEclipse Date: Sat, 27 Apr 2024 23:23:40 +0200 Subject: [PATCH] Initial Commit --- MainWindow.glade | 553 ++++++++++++++++++++++ README.md | 1 + experiments/draw_test.py | 67 +++ experiments/main.py | 17 + experiments/old_pdf_table_extractor_qt.py | 54 +++ experiments/qt6_test.py | 41 ++ experiments/test.py | 84 ++++ pdf_table_extractor.py | 160 +++++++ pyproject.toml | 22 + 9 files changed, 999 insertions(+) create mode 100644 MainWindow.glade create mode 100644 README.md create mode 100644 experiments/draw_test.py create mode 100644 experiments/main.py create mode 100644 experiments/old_pdf_table_extractor_qt.py create mode 100644 experiments/qt6_test.py create mode 100644 experiments/test.py create mode 100644 pdf_table_extractor.py create mode 100644 pyproject.toml diff --git a/MainWindow.glade b/MainWindow.glade new file mode 100644 index 0000000..d7f44ae --- /dev/null +++ b/MainWindow.glade @@ -0,0 +1,553 @@ + + + + + + + + application/pdf + + + diff --git a/README.md b/README.md new file mode 100644 index 0000000..7db5c82 --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +# Silly :3 diff --git a/experiments/draw_test.py b/experiments/draw_test.py new file mode 100644 index 0000000..a46a4b3 --- /dev/null +++ b/experiments/draw_test.py @@ -0,0 +1,67 @@ +import sys +import traceback +from typing import Dict, List, Optional, Sequence, Tuple, TypeAlias + +import gi +gi.require_version("Gtk", "3.0") +from gi.repository import GLib, Gio, Gtk, GObject +gi.require_foreign("cairo") +import cairo + + +class CustomDrawingArea(Gtk.DrawingArea): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.set_size_request(200, 200) + self.set_app_paintable(True) + self.connect_after("draw", self.on_draw, {}) + + def on_draw(self, widget: Gtk.DrawingArea, cr: cairo.Context, data: GObject.GPointer): + width = widget.get_allocated_width() + height = widget.get_allocated_height() + print(f"w: {width}, h: {height}") + + sctx = widget.get_style_context() + Gtk.render_background(sctx, cr, 0, 0, width, height) + + cr.set_source_rgba(1.0, 0.0, 0.0, 1.0) + cr.rectangle(0, 0, width, height) + cr.fill() + + +class MainWindow(Gtk.ApplicationWindow): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.app: Application = self.get_application() # type: ignore + assert self.app is not None + + box = Gtk.Box(orientation=Gtk.Orientation.VERTICAL) + self.add(box) + + self.area = CustomDrawingArea() + frame = Gtk.Frame(label="DrawingArea") + frame.add(self.area) + box.pack_start(frame, expand=True, fill=True, padding=20) + box.add(Gtk.Button(label=":)")) + + +class Application(Gtk.Application): + def __init__(self, *args, **kwargs): + super().__init__( + *args, + application_id="zone.lunareclipse.draw_test", + flags=Gio.ApplicationFlags.FLAGS_NONE, + **kwargs + ) + self.window = None + + def do_activate(self): + self.window = self.window or MainWindow(application=self) + self.window.show_all() + + +if __name__ == "__main__": + app = Application() + app.run(sys.argv) diff --git a/experiments/main.py b/experiments/main.py new file mode 100644 index 0000000..bc14b6b --- /dev/null +++ b/experiments/main.py @@ -0,0 +1,17 @@ +import argparse +import tabula + + +if __name__ == "__main__": + #parser = argparse.ArgumentParser() + #parser.add_argument("filename") + #args = parser.parse_args() + + table = tabula.io.read_pdf( + "../sources/2018_Torres-Benitez_Metabolomic analysis Parmotrema.pdf", + pages=[5], + lattice=False, + multiple_tables=True, + ) + print(table) + print("test") diff --git a/experiments/old_pdf_table_extractor_qt.py b/experiments/old_pdf_table_extractor_qt.py new file mode 100644 index 0000000..67d35a3 --- /dev/null +++ b/experiments/old_pdf_table_extractor_qt.py @@ -0,0 +1,54 @@ +import sys +from typing import Dict, List, Optional, Sequence, Tuple, TypeAlias + +from PySide6.QtWidgets import QApplication, QDialog, QHBoxLayout, QLabel, QMainWindow, QPushButton, QVBoxLayout, QWidget +from PySide6.QtCore import Slot +from PySide6.QtPdf import QPdfDocument +from PySide6.QtPdfWidgets import QPdfView +import fitz + +TEST_FILENAME = "/home/luna/Documents/Resources/Praca Licencjacka/sources/2018_Torres-Benitez_Metabolomic analysis Parmotrema.pdf" + +Coords: TypeAlias = Tuple[float, float] + +class Selection: + def __init__(self, bounds: Tuple[Coords, Coords], columns: Optional[Sequence[float]] = None): + self.bounds = bounds + self.columns = columns + +class Document: + def __init__(self, filename: str): + self.filename = filename + self.document = fitz.Document(filename) + self.selections: Dict[int, List[Selection]] = {} + +class SelectablePdfView(QPdfView): + pass + +class State(): + pass + +class Frontend(): + def __init__(self, argv: Sequence[str]): + self.app = QApplication(argv) + self.window = QMainWindow() + self.state = State() + + self.thumbnails = QWidget() + self.pdfDocument = QPdfDocument() + self.optionsPanel = QWidget() + + central_widget = QWidget() + cw_layout = QHBoxLayout(central_widget) + cw_layout.addWidget(self.thumbnails) + cw_layout.addWidget(self.optionsPanel) + self.window.setCentralWidget(central_widget) + + + def exec(self): + self.window.show() + self.app.exec() + +if __name__ == "__main__": + app = Frontend(sys.argv) + app.exec() diff --git a/experiments/qt6_test.py b/experiments/qt6_test.py new file mode 100644 index 0000000..7b5b264 --- /dev/null +++ b/experiments/qt6_test.py @@ -0,0 +1,41 @@ +import sys +from typing import Sequence + +from PySide6.QtWidgets import QApplication, QDialog, QLabel, QPushButton, QVBoxLayout +from PySide6.QtCore import Slot + +class Application(): + def __init__(self, argv: Sequence[str]): + self.app = QApplication(argv) + self.counter = 0 + self.window = QDialog() + self.layout = QVBoxLayout(self.window) + self.label = QLabel("0") + self.button_increment = QPushButton("Increment counter!") + self.button_decrement = QPushButton("Decrement counter!") + + self.window.setWindowTitle("PDF Table Extractor") + self.layout.addWidget(self.label) + self.layout.addWidget(self.button_increment) + self.layout.addWidget(self.button_decrement) + self.button_increment.clicked.connect(self.increment) + self.button_decrement.clicked.connect(self.decrement) + print(self.window.layout()) + + @Slot() + def increment(self): + self.counter += 1 + self.label.setText(f"{self.counter}") + + @Slot() + def decrement(self): + self.counter -= 1 + self.label.setText(f"{self.counter}") + + def exec(self): + self.window.show() + self.app.exec() + +if __name__ == "__main__": + app = Application(sys.argv) + app.exec() diff --git a/experiments/test.py b/experiments/test.py new file mode 100644 index 0000000..2edd57e --- /dev/null +++ b/experiments/test.py @@ -0,0 +1,84 @@ +import gi + +gi.require_version("Gtk", "3.0") +from gi.repository import Gtk + + +class FileChooserWindow(Gtk.Window): + def __init__(self): + super().__init__(title="FileChooser Example") + + box = Gtk.Box(spacing=6) + self.add(box) + + button1 = Gtk.Button(label="Choose File") + button1.connect("clicked", self.on_file_clicked) + box.add(button1) + + button2 = Gtk.Button(label="Choose Folder") + button2.connect("clicked", self.on_folder_clicked) + box.add(button2) + + def on_file_clicked(self, widget): + dialog = Gtk.FileChooserDialog( + title="Please choose a file", parent=self, action=Gtk.FileChooserAction.OPEN + ) + dialog.add_buttons( + Gtk.STOCK_CANCEL, + Gtk.ResponseType.CANCEL, + Gtk.STOCK_OPEN, + Gtk.ResponseType.OK, + ) + + self.add_filters(dialog) + + response = dialog.run() + if response == Gtk.ResponseType.OK: + print("Open clicked") + print("File selected: " + dialog.get_filename()) + elif response == Gtk.ResponseType.CANCEL: + print("Cancel clicked") + + dialog.destroy() + + def add_filters(self, dialog): + filter_text = Gtk.FileFilter() + filter_text.set_name("Text files") + filter_text.add_mime_type("text/plain") + dialog.add_filter(filter_text) + + filter_py = Gtk.FileFilter() + filter_py.set_name("Python files") + filter_py.add_mime_type("text/x-python") + dialog.add_filter(filter_py) + + filter_any = Gtk.FileFilter() + filter_any.set_name("Any files") + filter_any.add_pattern("*") + dialog.add_filter(filter_any) + + def on_folder_clicked(self, widget): + dialog = Gtk.FileChooserDialog( + title="Please choose a folder", + parent=self, + action=Gtk.FileChooserAction.SELECT_FOLDER, + ) + dialog.add_buttons( + Gtk.STOCK_CANCEL, Gtk.ResponseType.CANCEL, "Select", Gtk.ResponseType.OK + ) + dialog.set_default_size(800, 400) + + response = dialog.run() + if response == Gtk.ResponseType.OK: + print("Select clicked") + print("Folder selected: " + dialog.get_filename()) + elif response == Gtk.ResponseType.CANCEL: + print("Cancel clicked") + + dialog.destroy() + + +win = FileChooserWindow() +win.connect("destroy", Gtk.main_quit) +win.show_all() +Gtk.main() diff --git a/pdf_table_extractor.py b/pdf_table_extractor.py new file mode 100644 index 0000000..47f815c --- /dev/null +++ b/pdf_table_extractor.py @@ -0,0 +1,160 @@ +import sys +import traceback +from typing import Dict, List, Optional, Sequence, Tuple, TypeAlias + +import gi +gi.require_version("Gtk", "3.0") +from gi.repository import GLib, Gio, Gtk, GObject +gi.require_foreign("cairo") +import cairo + +import fitz + +TEST_FILENAME = "/home/luna/Documents/Resources/Praca Licencjacka/sources/2018_Torres-Benitez_Metabolomic analysis Parmotrema.pdf" + +Coords: TypeAlias = Tuple[float, float] + +class Selection: + def __init__(self, bounds: Tuple[Coords, Coords], columns: Optional[Sequence[float]] = None): + self.bounds = bounds + self.columns = columns + +class Document: + def __init__(self, filename: str): + self.filename = filename + self.document = fitz.Document(filename) + self.selections: Dict[int, List[Selection]] = {} + +class State(): + pass + +class PdfPage(Gtk.DrawingArea): + def __init__(self, page, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.page: fitz.Page = page + pix = self.page.get_pixmap(dpi=300) # type: ignore + self.set_size_request(pix.width, pix.height) + self.set_app_paintable(True) # type: ignore + self.connect("draw", self.on_draw, {}) + + def on_draw(self, widget, cr: cairo.Context, data: GObject.GPointer): + #app: Application = widget.get_window().get_application() # type: ignore + width = self.get_allocated_width() + height = self.get_allocated_height() + + sctx = self.get_style_context() + Gtk.render_background(sctx, cr, 0, 0, width, height) + + pix = self.page.get_pixmap(dpi=300) # type: ignore + png = pix.tobytes("png") + ims = cairo.ImageSurface.create_from_png(png) + cr.set_source_surface(ims, 0, 0) + cr.paint() + + cr.set_source_rgba(255, 0, 0) + cr.set_line_width(10) + cr.move_to(0, 0) + cr.line_to(width, height) + cr.stroke() + + +@Gtk.Template.from_file("MainWindow.glade") +class MainWindow(Gtk.ApplicationWindow): + __gtype_name__ = "main_window" + + open_button: Gtk.Button = Gtk.Template.Child() # type: ignore + header_bar: Gtk.HeaderBar = Gtk.Template.Child() # type: ignore + main_paned: Gtk.Paned = Gtk.Template.Child() # type: ignore + + pdf_list_box: Gtk.ListBox = Gtk.Template.Child() # type: ignore + + pdfFileFilter: Gtk.FileFilter = Gtk.Template.Child() # type: ignore + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.app: Application = self.get_application() # type: ignore + assert self.app is not None + + self.app.connect("notify::document", self.on_document_updated) + + #self.pdf_list_box.add(PdfPage()) + + + #@Gtk.Template.Callback() + #def example_button_released_cb(self, widget: Gtk.Button, **kwargs): + # assert self.example_button == widget + # print(widget.get_label()) + # widget.set_label("woah") + + @Gtk.Template.Callback() + def on_open_button_clicked(self, widget, *args, **kwargs): + dialog = Gtk.FileChooserDialog( + title="Choose PDF File to open", + transient_for=self, # equivalent to parent= + action=Gtk.FileChooserAction.OPEN, + filter=self.pdfFileFilter, + modal=True, + ) + dialog.add_button("Cancel", Gtk.ResponseType.CANCEL) + dialog.add_button("Open", Gtk.ResponseType.ACCEPT) + + response = dialog.run() # type: ignore + if response == Gtk.ResponseType.ACCEPT: + filename: str = dialog.get_filename() # type: ignore + try: + self.app.set_property("document", Document(filename)) + except Exception as e: + message_dialog = Gtk.MessageDialog( + title="An error has occured.", + transient_for=self, + modal=True, + message_type=Gtk.MessageType.ERROR, + text=repr(e), + secondary_text=traceback.format_exc(), + buttons=Gtk.ButtonsType.OK, + ) + message_dialog.run() # type: ignore + message_dialog.destroy() + dialog.destroy() + + # Note: this won't run unless a new document object is put into place + # editing an existing one won't trigger it + def on_document_updated(self, recvobj, gparamstring): + document: Document = self.app.get_property("document") + self.header_bar.set_title(document.filename.split("/")[-1]) # type: ignore + self.header_bar.set_subtitle(document.filename) # type: ignore + + for i in document.document.pages(): # type: ignore + page = PdfPage(i) + box = Gtk.Box() + box.pack_start(page, True, True, 20) + self.pdf_list_box.add(box) # type: ignore + + @Gtk.Template.Callback() + def on_open_button_small_clicked(self, widget, **kwargs): + pass + + +class Application(Gtk.Application): + document = GObject.Property(type=GObject.TYPE_PYOBJECT, flags=GObject.ParamFlags.READWRITE) + + def __init__(self, *args, **kwargs): + super().__init__( + *args, + application_id="zone.lunareclipse.pdf_table_extractor", + flags=Gio.ApplicationFlags.FLAGS_NONE, + #flags=Gio.ApplicationFlags.HANDLES_COMMAND_LINE, # TODO + **kwargs + ) + self.window = None + + def do_activate(self): + self.window = self.window or MainWindow(application=self) + self.window.show_all() # type: ignore + + +if __name__ == "__main__": + app = Application() + app.run(sys.argv) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..60604c3 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,22 @@ +[tool.poetry] +name = "pdf_table_extractor" +version = "0.1.0" +description = "Tool for extracting tabular data from PDFs" +authors = ["LunarEclipse "] +license = "GPL-3.0-only" +readme = "README.md" + +[tool.poetry.dependencies] +python = ">=3.11,<3.13" +pygobject = "^3.48.2" +tabula-py = "^2.9.0" +PyMuPDF = "^1.24.0" +#pdfplumber = "^0.11.0" +#PySide6 = "^6.6.3" # qt + +[tool.poetry.group.dev.dependencies] +pygobject-stubs = "^2.11.0" # this is actually broken lol + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api"