Diff
checker
Texto
Texto
Imagens
Documentos
Excel
Pastas
Legal
Enterprise
Aplicativo para desktop
Preços
Fazer login
Baixar o Diffchecker Desktop
Comparar texto
Encontre a diferença entre dois arquivos de texto
Ferramentas
Histórico
Editor live
Recolher inalteradas
Sem quebra de linha
Layout
Dividido
Unificado
Nível de detalhe
Inteligente
Palavra
Caractere
Realce de sintaxe
Escolher sintaxe
Ignorar
Transformar texto
Ir à primeira mudança
Editar entrada
Diffchecker Desktop
A maneira mais segura de usar o Diffchecker. Obtenha o aplicativo Diffchecker Desktop: seus diffs nunca saem do seu computador!
Obter Desktop
camelot/handlers.py
Criado
há 5 anos
O diff nunca expira
Limpar
Exportar
Compartilhar
Explicar
42 remoções
Linhas
Total
Removido
Caracteres
Total
Removido
Para continuar usando este recurso, atualize para
Diff
checker
Pro
Ver preços
169 linhas
Copiar tudo
77 adições
Linhas
Total
Adicionado
Caracteres
Total
Adicionado
Para continuar usando este recurso, atualize para
Diff
checker
Pro
Ver preços
181 linhas
Copiar tudo
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
import os
import os
import sys
import sys
Copiar
Copiado
Copiar
Copiado
from PyPDF2
import
PdfFileReader, PdfFileWriter
import
fitz
from .core import TableList
from .core import TableList
from .parsers import Stream, Lattice
from .parsers import Stream, Lattice
from .utils import (
from .utils import (
TemporaryDirectory,
TemporaryDirectory,
get_page_layout,
get_page_layout,
get_text_objects,
get_text_objects,
get_rotation,
get_rotation,
is_url,
is_url,
download_url,
download_url,
)
)
class PDFHandler(object):
class PDFHandler(object):
"""Handles all operations like temp directory creation, splitting
"""Handles all operations like temp directory creation, splitting
file into single page PDFs, parsing each PDF and then removing the
file into single page PDFs, parsing each PDF and then removing the
temp directory.
temp directory.
Parameters
Parameters
----------
----------
filepath : str
filepath : str
Filepath or URL of the PDF file.
Filepath or URL of the PDF file.
pages : str, optional (default: '1')
pages : str, optional (default: '1')
Comma-separated page numbers.
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'.
Example: '1,3,4' or '1,4-end' or 'all'.
password : str, optional (default: None)
password : str, optional (default: None)
Password for decryption.
Password for decryption.
"""
"""
def __init__(self, filepath, pages="1", password=None):
def __init__(self, filepath, pages="1", password=None):
if is_url(filepath):
if is_url(filepath):
filepath = download_url(filepath)
filepath = download_url(filepath)
self.filepath = filepath
self.filepath = filepath
if not filepath.lower().endswith(".pdf"):
if not filepath.lower().endswith(".pdf"):
raise NotImplementedError("File format not supported")
raise NotImplementedError("File format not supported")
if password is None:
if password is None:
self.password = ""
self.password = ""
else:
else:
self.password = password
self.password = password
if sys.version_info[0] < 3:
if sys.version_info[0] < 3:
self.password = self.password.encode("ascii")
self.password = self.password.encode("ascii")
self.pages = self._get_pages(self.filepath, pages)
self.pages = self._get_pages(self.filepath, pages)
def _get_pages(self, filepath, pages):
def _get_pages(self, filepath, pages):
"""Converts pages string to list of ints.
"""Converts pages string to list of ints.
Parameters
Parameters
----------
----------
filepath : str
filepath : str
Filepath or URL of the PDF file.
Filepath or URL of the PDF file.
pages : str, optional (default: '1')
pages : str, optional (default: '1')
Comma-separated page numbers.
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'.
Example: '1,3,4' or '1,4-end' or 'all'.
Returns
Returns
-------
-------
P : list
P : list
List of int page numbers.
List of int page numbers.
"""
"""
page_numbers = []
page_numbers = []
if pages == "1":
if pages == "1":
page_numbers.append({"start": 1, "end": 1})
page_numbers.append({"start": 1, "end": 1})
else:
else:
Copiar
Copiado
Copiar
Copiado
instream =
open(filepath
, "rb")
with fitz.
open(filepath
) as
infile
:
infile
= PdfFileReader(instream, strict=False)
if infile.
needsPass:
if infile.
isEncrypted:
infile.
authenticate
(self.password)
infile.
decrypt
(self.password)
if pages == "all":
if pages == "all":
page_numbers.append({"start": 1, "end": infile.
pageCount
})
page_numbers.append({"start": 1, "end": infile.
getNumPages()
})
else:
else:
for r in pages.split(","):
for r in pages.split(","):
if "-" in r:
if "-" in r:
a, b = r.split("-")
a, b = r.split("-")
if b == "end":
if b == "end":
b = infile.
pageCount
b = infile.
getNumPages()
page_numbers.append(
page_numbers.append(
{"start": int(a), "end": int(b)})
{"start": int(a), "end": int(b)})
else:
else:
page_numbers.append(
{"start": int(r), "end": int(r)})
page_numbers.append(
instream.close()
{"start": int(r), "end": int(r)})
P = []
P = []
for p in page_numbers:
for p in page_numbers:
P.extend(range(p["start"], p["end"] + 1))
P.extend(range(p["start"], p["end"] + 1))
return sorted(set(P))
return sorted(set(P))
def _save_page(self, filepath, page, temp):
def _save_page(self, filepath, page, temp):
"""Saves specified page from PDF into a temporary directory.
"""Saves specified page from PDF into a temporary directory.
Parameters
Parameters
----------
----------
filepath : str
filepath : str
Filepath or URL of the PDF file.
Filepath or URL of the PDF file.
page : int
page : int
Page number.
Page number.
temp : str
temp : str
Tmp directory.
Tmp directory.
"""
"""
Copiar
Copiado
Copiar
Copiado
with
open(filepath, "rb") as fileobj:
with
fitz.open(filepath) as infile:
infile = PdfFileReader(fileobj, strict=False)
if infile.needsPass:
if infile.isEncrypted:
infile.
authenticate
(self.password)
infile.
decrypt
(self.password)
fpath = os.path.join(temp, f"page-{page}.pdf")
fpath = os.path.join(temp, f"page-{page}.pdf")
froot, fext = os.path.splitext(fpath)
froot, fext = os.path.splitext(fpath)
Copiar
Copiado
Copiar
Copiado
p = infile
.getPage(
page - 1
)
p = infile
[
page - 1
]
outfile =
PdfFileWriter
()
p.setRotation(0
)
outfile.
addPage(p)
outfile =
fitz.open
()
with open(fpath, "wb") as f:
outpage =
outfile.
newPage(-1, width=p.rect.width,
outfile.
write(f)
height=p.rect.height)
outpage.showPDFpage(outpage.rect, infile, pno=page-1)
outfile.
save(fpath)
layout, dim = get_page_layout(fpath)
layout, dim = get_page_layout(fpath)
# fix rotated PDF
# fix rotated PDF
chars = get_text_objects(layout, ltype="char")
chars = get_text_objects(layout, ltype="char")
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
vertical_text = get_text_objects(layout, ltype="vertical_text")
vertical_text = get_text_objects(layout, ltype="vertical_text")
rotation = get_rotation(chars, horizontal_text, vertical_text)
rotation = get_rotation(chars, horizontal_text, vertical_text)
if rotation != "":
if rotation != "":
Copiar
Copiado
Copiar
Copiado
fpath_new = "".join(
[froot.replace("page", "p"), "_rotated", fext])
fpath_new = "".join(
[froot.replace("page", "p"), "_rotated", fext])
os.rename(fpath, fpath_new)
os.rename(fpath, fpath_new)
Copiar
Copiado
Copiar
Copiado
instream = open(fpath_new, "rb")
infile =
fitz.open(fpath_new
)
infile =
PdfFileReader(instream, strict=False
)
if infile.
needsPass
:
if infile.
isEncrypted
:
infile.
authenticate
(self.password)
infile.
decrypt
(self.password)
outfile =
fitz.open
()
outfile =
PdfFileWriter
()
p = infile
[0]
p = infile
.getPage(0)
outpage = outfile.newPage(-1, width=p.rect.width,
height=p.rect.height)
outpage.showPDFpage(outpage.rect, infile, pno=0)
if rotation == "anticlockwise":
if rotation == "anticlockwise":
Copiar
Copiado
Copiar
Copiado
p.rotateClockwise(
90)
outpage.setRotation((p.rotation +
90)
% 360)
elif rotation == "clockwise":
elif rotation == "clockwise":
Copiar
Copiado
Copiar
Copiado
p.rotateCounterClockwise(90)
outpage.setRotation((p.rotation + 270) % 360)
outfile.addPage(p)
with open(fpath, "wb") as f:
outfile.save(fpath)
outfile.write(f)
instream.close()
def parse(
def parse(
self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
):
):
"""Extracts tables by calling parser.get_tables on all single
"""Extracts tables by calling parser.get_tables on all single
page PDFs.
page PDFs.
Parameters
Parameters
----------
----------
flavor : str (default: 'lattice')
flavor : str (default: 'lattice')
The parsing method to use ('lattice' or 'stream').
The parsing method to use ('lattice' or 'stream').
Lattice is used by default.
Lattice is used by default.
suppress_stdout : str (default: False)
suppress_stdout : str (default: False)
Suppress logs and warnings.
Suppress logs and warnings.
layout_kwargs : dict, optional (default: {})
layout_kwargs : dict, optional (default: {})
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
kwargs : dict
kwargs : dict
See camelot.read_pdf kwargs.
See camelot.read_pdf kwargs.
Returns
Returns
-------
-------
tables : camelot.core.TableList
tables : camelot.core.TableList
List of tables found in PDF.
List of tables found in PDF.
"""
"""
tables = []
tables = []
with TemporaryDirectory() as tempdir:
with TemporaryDirectory() as tempdir:
Copiar
Copiado
Copiar
Copiado
for p in self.pages:
try:
self._save_page(self.filepath, p, tempdir)
for p in self.pages:
pages = [
self._save_page(self.filepath, p, tempdir)
os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages
pages = [
]
os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages
parser = Lattice(
**kwargs) if flavor == "lattice" else Stream(**kwargs)
]
for p in pages:
parser = Lattice(
t = parser.extract_tables(
**kwargs) if flavor == "lattice" else Stream(**kwargs)
p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
for p in pages:
)
t = parser.extract_tables(
tables.extend(t)
p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
)
tables.extend(t)
except ValueError as err:
if str(err) == "document closed or encrypted":
raise ValueError("file has not been decrypted") from err
raise
return TableList(sorted(tables))
return TableList(sorted(tables))
Diferenças salvas
Texto original
Abrir arquivo
# -*- coding: utf-8 -*-

import os
import sys

from PyPDF2 import PdfFileReader, PdfFileWriter

from .core import TableList
from .parsers import Stream, Lattice
from .utils import (
    TemporaryDirectory,
    get_page_layout,
    get_text_objects,
    get_rotation,
    is_url,
    download_url,
)


class PDFHandler(object):
    """Handles all operations like temp directory creation, splitting
    file into single page PDFs, parsing each PDF and then removing the
    temp directory.

    Parameters
    ----------
    filepath : str
        Filepath or URL of the PDF file.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
    password : str, optional (default: None)
        Password for decryption.

    """

    def __init__(self, filepath, pages="1", password=None):
        if is_url(filepath):
            filepath = download_url(filepath)
        self.filepath = filepath
        if not filepath.lower().endswith(".pdf"):
            raise NotImplementedError("File format not supported")

        if password is None:
            self.password = ""
        else:
            self.password = password
            # Only relevant on Python 2; the f-strings used further down in
            # this module already require Python 3.6+.
            if sys.version_info[0] < 3:
                self.password = self.password.encode("ascii")
        self.pages = self._get_pages(self.filepath, pages)

    def _get_pages(self, filepath, pages):
        """Converts pages string to list of ints.

        Parameters
        ----------
        filepath : str
            Filepath or URL of the PDF file.
        pages : str, optional (default: '1')
            Comma-separated page numbers.
            Example: '1,3,4' or '1,4-end' or 'all'.

        Returns
        -------
        P : list
            Sorted list of unique int page numbers.

        """
        page_numbers = []

        if pages == "1":
            # The default needs no page count, so the file is not opened.
            page_numbers.append({"start": 1, "end": 1})
        else:
            # ``with`` closes the file even when a malformed page spec makes
            # ``int()`` or the tuple unpacking below raise.
            with open(filepath, "rb") as instream:
                infile = PdfFileReader(instream, strict=False)
                if infile.isEncrypted:
                    infile.decrypt(self.password)

                if pages == "all":
                    page_numbers.append(
                        {"start": 1, "end": infile.getNumPages()}
                    )
                else:
                    for r in pages.split(","):
                        if "-" in r:
                            a, b = r.split("-")
                            if b == "end":
                                b = infile.getNumPages()
                            page_numbers.append(
                                {"start": int(a), "end": int(b)}
                            )
                        else:
                            page_numbers.append(
                                {"start": int(r), "end": int(r)}
                            )

        P = []
        for p in page_numbers:
            P.extend(range(p["start"], p["end"] + 1))
        return sorted(set(P))

    def _save_page(self, filepath, page, temp):
        """Saves specified page from PDF into a temporary directory.

        The page is written to ``page-<page>.pdf`` inside ``temp``. If
        ``get_rotation`` reports a rotation, that file is renamed to
        ``p-<page>_rotated.pdf`` and a counter-rotated copy is written back
        to ``page-<page>.pdf``.

        Parameters
        ----------
        filepath : str
            Filepath or URL of the PDF file.
        page : int
            Page number.
        temp : str
            Tmp directory.

        """
        fpath = os.path.join(temp, f"page-{page}.pdf")

        with open(filepath, "rb") as fileobj:
            infile = PdfFileReader(fileobj, strict=False)
            if infile.isEncrypted:
                infile.decrypt(self.password)
            outfile = PdfFileWriter()
            outfile.addPage(infile.getPage(page - 1))
            # Write while the source file is still open.
            with open(fpath, "wb") as f:
                outfile.write(f)

        # Everything below only touches the extracted single-page file.
        layout, _ = get_page_layout(fpath)
        # fix rotated PDF
        chars = get_text_objects(layout, ltype="char")
        horizontal_text = get_text_objects(layout, ltype="horizontal_text")
        vertical_text = get_text_objects(layout, ltype="vertical_text")
        rotation = get_rotation(chars, horizontal_text, vertical_text)

        if rotation != "":
            # Build the name from the directory and file name separately so
            # that a temp directory whose path contains "page" is left intact.
            fpath_new = os.path.join(temp, f"p-{page}_rotated.pdf")
            os.rename(fpath, fpath_new)

            with open(fpath_new, "rb") as instream:
                infile = PdfFileReader(instream, strict=False)
                if infile.isEncrypted:
                    infile.decrypt(self.password)
                outfile = PdfFileWriter()
                p = infile.getPage(0)
                # Rotate in the opposite direction to the detected rotation.
                if rotation == "anticlockwise":
                    p.rotateClockwise(90)
                elif rotation == "clockwise":
                    p.rotateCounterClockwise(90)
                outfile.addPage(p)
                with open(fpath, "wb") as f:
                    outfile.write(f)

    def parse(
        self, flavor="lattice", suppress_stdout=False, layout_kwargs=None, **kwargs
    ):
        """Extracts tables by calling parser.extract_tables on all single
        page PDFs.

        Parameters
        ----------
        flavor : str (default: 'lattice')
            The parsing method to use ('lattice' or 'stream').
            Lattice is used by default.
        suppress_stdout : bool (default: False)
            Suppress logs and warnings.
        layout_kwargs : dict, optional (default: None)
            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
            ``None`` is treated as an empty dict.
        kwargs : dict
            See camelot.read_pdf kwargs.

        Returns
        -------
        tables : camelot.core.TableList
            List of tables found in PDF.

        """
        # A fresh dict per call avoids sharing one mutable default object
        # between invocations.
        if layout_kwargs is None:
            layout_kwargs = {}

        tables = []
        with TemporaryDirectory() as tempdir:
            for p in self.pages:
                self._save_page(self.filepath, p, tempdir)
            pages = [
                os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages
            ]
            parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
            for p in pages:
                t = parser.extract_tables(
                    p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
                )
                tables.extend(t)
        return TableList(sorted(tables))
Texto alterado
Abrir arquivo
# -*- coding: utf-8 -*-

import os
import sys

import fitz

from .core import TableList
from .parsers import Stream, Lattice
from .utils import (
    TemporaryDirectory,
    get_page_layout,
    get_text_objects,
    get_rotation,
    is_url,
    download_url,
)


class PDFHandler(object):
    """Handles all operations like temp directory creation, splitting
    file into single page PDFs, parsing each PDF and then removing the
    temp directory.

    Parameters
    ----------
    filepath : str
        Filepath or URL of the PDF file.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
    password : str, optional (default: None)
        Password for decryption.

    """

    def __init__(self, filepath, pages="1", password=None):
        if is_url(filepath):
            filepath = download_url(filepath)
        self.filepath = filepath
        if not filepath.lower().endswith(".pdf"):
            raise NotImplementedError("File format not supported")

        if password is None:
            self.password = ""
        else:
            self.password = password
            # Only relevant on Python 2; the f-strings used further down in
            # this module already require Python 3.6+.
            if sys.version_info[0] < 3:
                self.password = self.password.encode("ascii")
        self.pages = self._get_pages(self.filepath, pages)

    def _get_pages(self, filepath, pages):
        """Converts pages string to list of ints.

        Parameters
        ----------
        filepath : str
            Filepath or URL of the PDF file.
        pages : str, optional (default: '1')
            Comma-separated page numbers.
            Example: '1,3,4' or '1,4-end' or 'all'.

        Returns
        -------
        P : list
            Sorted list of unique int page numbers.

        """
        page_numbers = []

        if pages == "1":
            # The default needs no page count, so the file is not opened.
            page_numbers.append({"start": 1, "end": 1})
        else:
            with fitz.open(filepath) as infile:
                if infile.needsPass:
                    infile.authenticate(self.password)

                if pages == "all":
                    page_numbers.append(
                        {"start": 1, "end": infile.pageCount}
                    )
                else:
                    for r in pages.split(","):
                        if "-" in r:
                            a, b = r.split("-")
                            if b == "end":
                                b = infile.pageCount
                            page_numbers.append(
                                {"start": int(a), "end": int(b)}
                            )
                        else:
                            page_numbers.append(
                                {"start": int(r), "end": int(r)}
                            )

        P = []
        for p in page_numbers:
            P.extend(range(p["start"], p["end"] + 1))
        return sorted(set(P))

    def _save_page(self, filepath, page, temp):
        """Saves specified page from PDF into a temporary directory.

        The page is written to ``page-<page>.pdf`` inside ``temp``. If
        ``get_rotation`` reports a rotation, that file is renamed to
        ``p-<page>_rotated.pdf`` and a re-rotated copy is written back to
        ``page-<page>.pdf``.

        Parameters
        ----------
        filepath : str
            Filepath or URL of the PDF file.
        page : int
            Page number.
        temp : str
            Tmp directory.

        """
        fpath = os.path.join(temp, f"page-{page}.pdf")

        # Both documents are closed on exit, including the freshly created
        # output document, which previously was never closed.
        with fitz.open(filepath) as infile:
            if infile.needsPass:
                infile.authenticate(self.password)
            p = infile[page - 1]
            # Reset the rotation before reading the page rectangle.
            p.setRotation(0)
            with fitz.open() as outfile:
                outpage = outfile.newPage(
                    -1, width=p.rect.width, height=p.rect.height
                )
                outpage.showPDFpage(outpage.rect, infile, pno=page - 1)
                outfile.save(fpath)

        # Everything below only touches the extracted single-page file.
        layout, _ = get_page_layout(fpath)
        # fix rotated PDF
        chars = get_text_objects(layout, ltype="char")
        horizontal_text = get_text_objects(layout, ltype="horizontal_text")
        vertical_text = get_text_objects(layout, ltype="vertical_text")
        rotation = get_rotation(chars, horizontal_text, vertical_text)

        if rotation != "":
            # Build the name from the directory and file name separately so
            # that a temp directory whose path contains "page" is left intact.
            fpath_new = os.path.join(temp, f"p-{page}_rotated.pdf")
            os.rename(fpath, fpath_new)

            # The output document exits first, then the renamed source, so
            # no handle into the temp directory outlives this method.
            with fitz.open(fpath_new) as infile, fitz.open() as outfile:
                if infile.needsPass:
                    infile.authenticate(self.password)
                p = infile[0]
                outpage = outfile.newPage(
                    -1, width=p.rect.width, height=p.rect.height
                )
                outpage.showPDFpage(outpage.rect, infile, pno=0)
                if rotation == "anticlockwise":
                    outpage.setRotation((p.rotation + 90) % 360)
                elif rotation == "clockwise":
                    outpage.setRotation((p.rotation + 270) % 360)
                outfile.save(fpath)

    def parse(
        self, flavor="lattice", suppress_stdout=False, layout_kwargs=None, **kwargs
    ):
        """Extracts tables by calling parser.extract_tables on all single
        page PDFs.

        Parameters
        ----------
        flavor : str (default: 'lattice')
            The parsing method to use ('lattice' or 'stream').
            Lattice is used by default.
        suppress_stdout : bool (default: False)
            Suppress logs and warnings.
        layout_kwargs : dict, optional (default: None)
            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
            ``None`` is treated as an empty dict.
        kwargs : dict
            See camelot.read_pdf kwargs.

        Returns
        -------
        tables : camelot.core.TableList
            List of tables found in PDF.

        Raises
        ------
        ValueError
            With the message "file has not been decrypted" when a
            ``ValueError`` whose message is exactly
            "document closed or encrypted" is raised while saving or
            parsing pages. Any other ``ValueError`` propagates unchanged.

        """
        # A fresh dict per call avoids sharing one mutable default object
        # between invocations.
        if layout_kwargs is None:
            layout_kwargs = {}

        tables = []
        with TemporaryDirectory() as tempdir:
            try:
                for p in self.pages:
                    self._save_page(self.filepath, p, tempdir)
                pages = [
                    os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages
                ]
                parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
                for p in pages:
                    t = parser.extract_tables(
                        p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
                    )
                    tables.extend(t)
            except ValueError as err:
                # Translate this one specific message into a clearer error.
                if str(err) == "document closed or encrypted":
                    raise ValueError("file has not been decrypted") from err
                raise
        return TableList(sorted(tables))
Encontrar Diferença