Diff
checker
Testo
Testo
Immagini
Documenti
Excel
Cartelle
Legal
Enterprise
Applicazione per desktop
Prezzi
Accedi
Scarica Diffchecker Desktop
Confronta il testo
Trova la differenza tra due file di testo
Strumenti
Cronologia
Editor live
Comprimi invariate
Senza a capo
Layout
Diviso
Unificato
Livello di dettaglio
Intelligente
Parola
Carattere
Evidenziazione sintassi
Scegli sintassi
Ignora
Trasforma testo
Vai alla prima modifica
Modifica input
Diffchecker Desktop
Il modo più sicuro per usare Diffchecker. Ottieni l'app Diffchecker Desktop: i tuoi diff non lasciano mai il tuo computer!
Ottieni Desktop
camelot/handlers.py
Creato
5 anni fa
Il diff non scade mai
Eliminare
Esporta
Condividere
Spiegare
42 rimozioni
Linee
Totale
Rimosso
Caratteri
Totale
Rimosso
Per continuare a utilizzare questa funzione, aggiorna a
Diff
checker
Pro
Visualizza prezzi
169 linee
Copia tutti
77 aggiunte
Linee
Totale
Aggiunto
Caratteri
Totale
Aggiunto
Per continuare a utilizzare questa funzione, aggiorna a
Diff
checker
Pro
Visualizza prezzi
181 linee
Copia tutti
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
import os
import os
import sys
import sys
Copia
Copiato
Copia
Copiato
from PyPDF2
import
PdfFileReader, PdfFileWriter
import
fitz
from .core import TableList
from .core import TableList
from .parsers import Stream, Lattice
from .parsers import Stream, Lattice
from .utils import (
from .utils import (
TemporaryDirectory,
TemporaryDirectory,
get_page_layout,
get_page_layout,
get_text_objects,
get_text_objects,
get_rotation,
get_rotation,
is_url,
is_url,
download_url,
download_url,
)
)
class PDFHandler(object):
class PDFHandler(object):
"""Handles all operations like temp directory creation, splitting
"""Handles all operations like temp directory creation, splitting
file into single page PDFs, parsing each PDF and then removing the
file into single page PDFs, parsing each PDF and then removing the
temp directory.
temp directory.
Parameters
Parameters
----------
----------
filepath : str
filepath : str
Filepath or URL of the PDF file.
Filepath or URL of the PDF file.
pages : str, optional (default: '1')
pages : str, optional (default: '1')
Comma-separated page numbers.
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'.
Example: '1,3,4' or '1,4-end' or 'all'.
password : str, optional (default: None)
password : str, optional (default: None)
Password for decryption.
Password for decryption.
"""
"""
def __init__(self, filepath, pages="1", password=None):
def __init__(self, filepath, pages="1", password=None):
if is_url(filepath):
if is_url(filepath):
filepath = download_url(filepath)
filepath = download_url(filepath)
self.filepath = filepath
self.filepath = filepath
if not filepath.lower().endswith(".pdf"):
if not filepath.lower().endswith(".pdf"):
raise NotImplementedError("File format not supported")
raise NotImplementedError("File format not supported")
if password is None:
if password is None:
self.password = ""
self.password = ""
else:
else:
self.password = password
self.password = password
if sys.version_info[0] < 3:
if sys.version_info[0] < 3:
self.password = self.password.encode("ascii")
self.password = self.password.encode("ascii")
self.pages = self._get_pages(self.filepath, pages)
self.pages = self._get_pages(self.filepath, pages)
def _get_pages(self, filepath, pages):
def _get_pages(self, filepath, pages):
"""Converts pages string to list of ints.
"""Converts pages string to list of ints.
Parameters
Parameters
----------
----------
filepath : str
filepath : str
Filepath or URL of the PDF file.
Filepath or URL of the PDF file.
pages : str, optional (default: '1')
pages : str, optional (default: '1')
Comma-separated page numbers.
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'.
Example: '1,3,4' or '1,4-end' or 'all'.
Returns
Returns
-------
-------
P : list
P : list
List of int page numbers.
List of int page numbers.
"""
"""
page_numbers = []
page_numbers = []
if pages == "1":
if pages == "1":
page_numbers.append({"start": 1, "end": 1})
page_numbers.append({"start": 1, "end": 1})
else:
else:
Copia
Copiato
Copia
Copiato
instream =
open(filepath
, "rb")
with fitz.
open(filepath
) as
infile
:
infile
= PdfFileReader(instream, strict=False)
if infile.
needsPass:
if infile.
isEncrypted:
infile.
authenticate
(self.password)
infile.
decrypt
(self.password)
if pages == "all":
if pages == "all":
page_numbers.append({"start": 1, "end": infile.
pageCount
})
page_numbers.append({"start": 1, "end": infile.
getNumPages()
})
else:
else:
for r in pages.split(","):
for r in pages.split(","):
if "-" in r:
if "-" in r:
a, b = r.split("-")
a, b = r.split("-")
if b == "end":
if b == "end":
b = infile.
pageCount
b = infile.
getNumPages()
page_numbers.append(
page_numbers.append(
{"start": int(a), "end": int(b)})
{"start": int(a), "end": int(b)})
else:
else:
page_numbers.append(
{"start": int(r), "end": int(r)})
page_numbers.append(
instream.close()
{"start": int(r), "end": int(r)})
P = []
P = []
for p in page_numbers:
for p in page_numbers:
P.extend(range(p["start"], p["end"] + 1))
P.extend(range(p["start"], p["end"] + 1))
return sorted(set(P))
return sorted(set(P))
def _save_page(self, filepath, page, temp):
def _save_page(self, filepath, page, temp):
"""Saves specified page from PDF into a temporary directory.
"""Saves specified page from PDF into a temporary directory.
Parameters
Parameters
----------
----------
filepath : str
filepath : str
Filepath or URL of the PDF file.
Filepath or URL of the PDF file.
page : int
page : int
Page number.
Page number.
temp : str
temp : str
Tmp directory.
Tmp directory.
"""
"""
Copia
Copiato
Copia
Copiato
with
open(filepath, "rb") as fileobj:
with
fitz.open(filepath) as infile:
infile = PdfFileReader(fileobj, strict=False)
if infile.needsPass:
if infile.isEncrypted:
infile.
authenticate
(self.password)
infile.
decrypt
(self.password)
fpath = os.path.join(temp, f"page-{page}.pdf")
fpath = os.path.join(temp, f"page-{page}.pdf")
froot, fext = os.path.splitext(fpath)
froot, fext = os.path.splitext(fpath)
Copia
Copiato
Copia
Copiato
p = infile
.getPage(
page - 1
)
p = infile
[
page - 1
]
outfile =
PdfFileWriter
()
p.setRotation(0
)
outfile.
addPage(p)
outfile =
fitz.open
()
with open(fpath, "wb") as f:
outpage =
outfile.
newPage(-1, width=p.rect.width,
outfile.
write(f)
height=p.rect.height)
outpage.showPDFpage(outpage.rect, infile, pno=page-1)
outfile.
save(fpath)
layout, dim = get_page_layout(fpath)
layout, dim = get_page_layout(fpath)
# fix rotated PDF
# fix rotated PDF
chars = get_text_objects(layout, ltype="char")
chars = get_text_objects(layout, ltype="char")
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
vertical_text = get_text_objects(layout, ltype="vertical_text")
vertical_text = get_text_objects(layout, ltype="vertical_text")
rotation = get_rotation(chars, horizontal_text, vertical_text)
rotation = get_rotation(chars, horizontal_text, vertical_text)
if rotation != "":
if rotation != "":
Copia
Copiato
Copia
Copiato
fpath_new = "".join(
[froot.replace("page", "p"), "_rotated", fext])
fpath_new = "".join(
[froot.replace("page", "p"), "_rotated", fext])
os.rename(fpath, fpath_new)
os.rename(fpath, fpath_new)
Copia
Copiato
Copia
Copiato
instream = open(fpath_new, "rb")
infile =
fitz.open(fpath_new
)
infile =
PdfFileReader(instream, strict=False
)
if infile.
needsPass
:
if infile.
isEncrypted
:
infile.
authenticate
(self.password)
infile.
decrypt
(self.password)
outfile =
fitz.open
()
outfile =
PdfFileWriter
()
p = infile
[0]
p = infile
.getPage(0)
outpage = outfile.newPage(-1, width=p.rect.width,
height=p.rect.height)
outpage.showPDFpage(outpage.rect, infile, pno=0)
if rotation == "anticlockwise":
if rotation == "anticlockwise":
Copia
Copiato
Copia
Copiato
p.rotateClockwise(
90)
outpage.setRotation((p.rotation +
90)
% 360)
elif rotation == "clockwise":
elif rotation == "clockwise":
Copia
Copiato
Copia
Copiato
p.rotateCounterClockwise(90)
outpage.setRotation((p.rotation + 270) % 360)
outfile.addPage(p)
with open(fpath, "wb") as f:
outfile.save(fpath)
outfile.write(f)
instream.close()
def parse(
def parse(
self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
):
):
"""Extracts tables by calling parser.get_tables on all single
"""Extracts tables by calling parser.get_tables on all single
page PDFs.
page PDFs.
Parameters
Parameters
----------
----------
flavor : str (default: 'lattice')
flavor : str (default: 'lattice')
The parsing method to use ('lattice' or 'stream').
The parsing method to use ('lattice' or 'stream').
Lattice is used by default.
Lattice is used by default.
suppress_stdout : str (default: False)
suppress_stdout : str (default: False)
Suppress logs and warnings.
Suppress logs and warnings.
layout_kwargs : dict, optional (default: {})
layout_kwargs : dict, optional (default: {})
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
kwargs : dict
kwargs : dict
See camelot.read_pdf kwargs.
See camelot.read_pdf kwargs.
Returns
Returns
-------
-------
tables : camelot.core.TableList
tables : camelot.core.TableList
List of tables found in PDF.
List of tables found in PDF.
"""
"""
tables = []
tables = []
with TemporaryDirectory() as tempdir:
with TemporaryDirectory() as tempdir:
Copia
Copiato
Copia
Copiato
for p in self.pages:
try:
self._save_page(self.filepath, p, tempdir)
for p in self.pages:
pages = [
self._save_page(self.filepath, p, tempdir)
os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages
pages = [
]
os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages
parser = Lattice(
**kwargs) if flavor == "lattice" else Stream(**kwargs)
]
for p in pages:
parser = Lattice(
t = parser.extract_tables(
**kwargs) if flavor == "lattice" else Stream(**kwargs)
p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
for p in pages:
)
t = parser.extract_tables(
tables.extend(t)
p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
)
tables.extend(t)
except ValueError as err:
if str(err) == "document closed or encrypted":
raise ValueError("file has not been decrypted") from err
raise
return TableList(sorted(tables))
return TableList(sorted(tables))
Diff salvati
Testo originale
Apri file
# -*- coding: utf-8 -*-

import os
import sys

from PyPDF2 import PdfFileReader, PdfFileWriter

from .core import TableList
from .parsers import Stream, Lattice
from .utils import (
    TemporaryDirectory,
    get_page_layout,
    get_text_objects,
    get_rotation,
    is_url,
    download_url,
)


class PDFHandler(object):
    """Handles all operations like temp directory creation, splitting
    file into single page PDFs, parsing each PDF and then removing the
    temp directory.

    Parameters
    ----------
    filepath : str
        Filepath or URL of the PDF file.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
    password : str, optional (default: None)
        Password for decryption.

    """

    def __init__(self, filepath, pages="1", password=None):
        # A URL is replaced by whatever download_url returns
        # (presumably a local file path -- confirm against .utils).
        if is_url(filepath):
            filepath = download_url(filepath)
        self.filepath = filepath
        if not filepath.lower().endswith(".pdf"):
            raise NotImplementedError("File format not supported")

        if password is None:
            self.password = ""
        else:
            self.password = password
            if sys.version_info[0] < 3:
                self.password = self.password.encode("ascii")
        self.pages = self._get_pages(self.filepath, pages)

    def _get_pages(self, filepath, pages):
        """Converts pages string to list of ints.

        Parameters
        ----------
        filepath : str
            Filepath or URL of the PDF file.
        pages : str, optional (default: '1')
            Comma-separated page numbers.
            Example: '1,3,4' or '1,4-end' or 'all'.

        Returns
        -------
        P : list
            Sorted list of unique int page numbers.

        Raises
        ------
        ValueError
            If a page token cannot be converted to int.

        """
        page_numbers = []
        if pages == "1":
            # The default needs no page count, so the file is not opened.
            page_numbers.append({"start": 1, "end": 1})
        else:
            # The context manager guarantees the handle is released even
            # when a malformed page spec makes int() raise below.
            with open(filepath, "rb") as instream:
                infile = PdfFileReader(instream, strict=False)
                if infile.isEncrypted:
                    infile.decrypt(self.password)
                if pages == "all":
                    page_numbers.append(
                        {"start": 1, "end": infile.getNumPages()}
                    )
                else:
                    for r in pages.split(","):
                        if "-" in r:
                            a, b = r.split("-")
                            if b == "end":
                                b = infile.getNumPages()
                            page_numbers.append(
                                {"start": int(a), "end": int(b)}
                            )
                        else:
                            page_numbers.append(
                                {"start": int(r), "end": int(r)}
                            )
        P = []
        for p in page_numbers:
            P.extend(range(p["start"], p["end"] + 1))
        return sorted(set(P))

    def _save_page(self, filepath, page, temp):
        """Saves specified page from PDF into a temporary directory.

        The page is written to ``page-<page>.pdf`` inside `temp`. When
        get_rotation reports a rotation, that file is renamed to
        ``p-<page>_rotated.pdf`` and a counter-rotated copy is written
        back under the original name.

        Parameters
        ----------
        filepath : str
            Filepath or URL of the PDF file.
        page : int
            Page number.
        temp : str
            Tmp directory.

        """
        fpath = os.path.join(temp, f"page-{page}.pdf")
        froot, fext = os.path.splitext(fpath)

        # The writer pulls page data from the source stream while
        # writing, so the write stays inside the source `with` block.
        with open(filepath, "rb") as fileobj:
            infile = PdfFileReader(fileobj, strict=False)
            if infile.isEncrypted:
                infile.decrypt(self.password)
            outfile = PdfFileWriter()
            outfile.addPage(infile.getPage(page - 1))
            with open(fpath, "wb") as f:
                outfile.write(f)

        layout, _ = get_page_layout(fpath)
        # fix rotated PDF
        chars = get_text_objects(layout, ltype="char")
        horizontal_text = get_text_objects(layout, ltype="horizontal_text")
        vertical_text = get_text_objects(layout, ltype="vertical_text")
        rotation = get_rotation(chars, horizontal_text, vertical_text)
        if rotation != "":
            fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
            os.rename(fpath, fpath_new)
            # `with` closes the renamed file even if rotating or writing
            # fails part-way through.
            with open(fpath_new, "rb") as instream:
                rotated_in = PdfFileReader(instream, strict=False)
                if rotated_in.isEncrypted:
                    rotated_in.decrypt(self.password)
                rotated_out = PdfFileWriter()
                p = rotated_in.getPage(0)
                if rotation == "anticlockwise":
                    p.rotateClockwise(90)
                elif rotation == "clockwise":
                    p.rotateCounterClockwise(90)
                rotated_out.addPage(p)
                with open(fpath, "wb") as f:
                    rotated_out.write(f)

    def parse(
        self, flavor="lattice", suppress_stdout=False, layout_kwargs=None, **kwargs
    ):
        """Extracts tables by calling parser.extract_tables on all single
        page PDFs.

        Parameters
        ----------
        flavor : str (default: 'lattice')
            The parsing method to use ('lattice' or 'stream').
            Lattice is used by default.
        suppress_stdout : bool (default: False)
            Suppress logs and warnings.
        layout_kwargs : dict, optional (default: None)
            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
            None is treated as an empty dict.
        kwargs : dict
            See camelot.read_pdf kwargs.

        Returns
        -------
        tables : camelot.core.TableList
            List of tables found in PDF.

        """
        # Avoid a shared mutable default: build a fresh dict per call.
        if layout_kwargs is None:
            layout_kwargs = {}

        tables = []
        with TemporaryDirectory() as tempdir:
            for p in self.pages:
                self._save_page(self.filepath, p, tempdir)
            pages = [os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages]
            # Any flavor other than 'lattice' falls back to Stream.
            parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
            for p in pages:
                t = parser.extract_tables(
                    p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
                )
                tables.extend(t)
        return TableList(sorted(tables))
Testo modificato
Apri file
# -*- coding: utf-8 -*-

import os
import sys

import fitz

from .core import TableList
from .parsers import Stream, Lattice
from .utils import (
    TemporaryDirectory,
    get_page_layout,
    get_text_objects,
    get_rotation,
    is_url,
    download_url,
)


class PDFHandler(object):
    """Handles all operations like temp directory creation, splitting
    file into single page PDFs, parsing each PDF and then removing the
    temp directory.

    Parameters
    ----------
    filepath : str
        Filepath or URL of the PDF file.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
    password : str, optional (default: None)
        Password for decryption.

    """

    def __init__(self, filepath, pages="1", password=None):
        # A URL is replaced by whatever download_url returns
        # (presumably a local file path -- confirm against .utils).
        if is_url(filepath):
            filepath = download_url(filepath)
        self.filepath = filepath
        if not filepath.lower().endswith(".pdf"):
            raise NotImplementedError("File format not supported")

        if password is None:
            self.password = ""
        else:
            self.password = password
            if sys.version_info[0] < 3:
                self.password = self.password.encode("ascii")
        self.pages = self._get_pages(self.filepath, pages)

    def _get_pages(self, filepath, pages):
        """Converts pages string to list of ints.

        Parameters
        ----------
        filepath : str
            Filepath or URL of the PDF file.
        pages : str, optional (default: '1')
            Comma-separated page numbers.
            Example: '1,3,4' or '1,4-end' or 'all'.

        Returns
        -------
        P : list
            Sorted list of unique int page numbers.

        Raises
        ------
        ValueError
            If a page token cannot be converted to int.

        """
        page_numbers = []
        if pages == "1":
            # The default needs no page count, so the file is not opened.
            page_numbers.append({"start": 1, "end": 1})
        else:
            with fitz.open(filepath) as infile:
                if infile.needsPass:
                    infile.authenticate(self.password)
                if pages == "all":
                    page_numbers.append({"start": 1, "end": infile.pageCount})
                else:
                    for r in pages.split(","):
                        if "-" in r:
                            a, b = r.split("-")
                            if b == "end":
                                b = infile.pageCount
                            page_numbers.append(
                                {"start": int(a), "end": int(b)}
                            )
                        else:
                            page_numbers.append(
                                {"start": int(r), "end": int(r)}
                            )
        P = []
        for p in page_numbers:
            P.extend(range(p["start"], p["end"] + 1))
        return sorted(set(P))

    def _save_page(self, filepath, page, temp):
        """Saves specified page from PDF into a temporary directory.

        The page is drawn onto a fresh single-page document saved as
        ``page-<page>.pdf`` inside `temp`. When get_rotation reports a
        rotation, that file is renamed to ``p-<page>_rotated.pdf`` and a
        re-rotated copy is saved back under the original name.

        Parameters
        ----------
        filepath : str
            Filepath or URL of the PDF file.
        page : int
            Page number.
        temp : str
            Tmp directory.

        """
        fpath = os.path.join(temp, f"page-{page}.pdf")
        froot, fext = os.path.splitext(fpath)

        # Both documents are context-managed so neither the source nor
        # the freshly built single-page document is left open.
        with fitz.open(filepath) as infile, fitz.open() as outfile:
            if infile.needsPass:
                infile.authenticate(self.password)
            p = infile[page - 1]
            # Reset the rotation of the in-memory source page before it
            # is measured and drawn onto the new page.
            p.setRotation(0)
            outpage = outfile.newPage(
                -1, width=p.rect.width, height=p.rect.height
            )
            outpage.showPDFpage(outpage.rect, infile, pno=page - 1)
            outfile.save(fpath)

        layout, _ = get_page_layout(fpath)
        # fix rotated PDF
        chars = get_text_objects(layout, ltype="char")
        horizontal_text = get_text_objects(layout, ltype="horizontal_text")
        vertical_text = get_text_objects(layout, ltype="vertical_text")
        rotation = get_rotation(chars, horizontal_text, vertical_text)
        if rotation != "":
            fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
            os.rename(fpath, fpath_new)
            # Separate names and context managers: the renamed file and
            # the rotated output are both closed once the save finishes
            # or fails.
            with fitz.open(fpath_new) as rotated_in, fitz.open() as rotated_out:
                if rotated_in.needsPass:
                    rotated_in.authenticate(self.password)
                p = rotated_in[0]
                outpage = rotated_out.newPage(
                    -1, width=p.rect.width, height=p.rect.height
                )
                outpage.showPDFpage(outpage.rect, rotated_in, pno=0)
                if rotation == "anticlockwise":
                    outpage.setRotation((p.rotation + 90) % 360)
                elif rotation == "clockwise":
                    outpage.setRotation((p.rotation + 270) % 360)
                rotated_out.save(fpath)

    def parse(
        self, flavor="lattice", suppress_stdout=False, layout_kwargs=None, **kwargs
    ):
        """Extracts tables by calling parser.extract_tables on all single
        page PDFs.

        Parameters
        ----------
        flavor : str (default: 'lattice')
            The parsing method to use ('lattice' or 'stream').
            Lattice is used by default.
        suppress_stdout : bool (default: False)
            Suppress logs and warnings.
        layout_kwargs : dict, optional (default: None)
            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
            None is treated as an empty dict.
        kwargs : dict
            See camelot.read_pdf kwargs.

        Returns
        -------
        tables : camelot.core.TableList
            List of tables found in PDF.

        Raises
        ------
        ValueError
            "file has not been decrypted" when a ValueError with the
            message "document closed or encrypted" is raised while
            saving or parsing pages; any other ValueError is re-raised
            unchanged.

        """
        # Avoid a shared mutable default: build a fresh dict per call.
        if layout_kwargs is None:
            layout_kwargs = {}

        tables = []
        with TemporaryDirectory() as tempdir:
            try:
                for p in self.pages:
                    self._save_page(self.filepath, p, tempdir)
                pages = [
                    os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages
                ]
                # Any flavor other than 'lattice' falls back to Stream.
                parser = (
                    Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
                )
                for p in pages:
                    t = parser.extract_tables(
                        p,
                        suppress_stdout=suppress_stdout,
                        layout_kwargs=layout_kwargs,
                    )
                    tables.extend(t)
            except ValueError as err:
                # Translate this one message into a clearer error and
                # keep the original exception chained as the cause.
                if str(err) == "document closed or encrypted":
                    raise ValueError("file has not been decrypted") from err
                raise
        return TableList(sorted(tables))
Trovare la differenza