Diff
checker
文本
文本
圖像
文檔
Excel
文件夾
Legal
Enterprise
桌面版
定價
登入
下載 Diffchecker 桌面版
比較文本
尋找兩個文字檔案之間的差異
工具
歷史
即時編輯器
摺疊未變更行
關閉換行
檢視
拆分
統一
比對精度
智能
單詞
字符
語法突出顯示
選擇語法
忽略
文字轉換
前往第一個差異
編輯輸入
Diffchecker Desktop
執行Diffchecker最安全的方式。取得Diffchecker桌面應用程式:您的差異永遠不會離開您的電腦!
取得桌面版
camelot/handlers.py
建立於
5 年前
差異永不過期
清除
匯出
分享
解釋
42 刪除
行
總計
刪除
字符
總計
刪除
要繼續使用此功能,請升級到
Diff
checker
Pro
查看價格
169 行
全部複製
77 新增
行
總計
新增
字符
總計
新增
要繼續使用此功能,請升級到
Diff
checker
Pro
查看價格
181 行
全部複製
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
import os
import os
import sys
import sys
複製
已複製
複製
已複製
from PyPDF2
import
PdfFileReader, PdfFileWriter
import
fitz
from .core import TableList
from .core import TableList
from .parsers import Stream, Lattice
from .parsers import Stream, Lattice
from .utils import (
from .utils import (
TemporaryDirectory,
TemporaryDirectory,
get_page_layout,
get_page_layout,
get_text_objects,
get_text_objects,
get_rotation,
get_rotation,
is_url,
is_url,
download_url,
download_url,
)
)
class PDFHandler(object):
class PDFHandler(object):
"""Handles all operations like temp directory creation, splitting
"""Handles all operations like temp directory creation, splitting
file into single page PDFs, parsing each PDF and then removing the
file into single page PDFs, parsing each PDF and then removing the
temp directory.
temp directory.
Parameters
Parameters
----------
----------
filepath : str
filepath : str
Filepath or URL of the PDF file.
Filepath or URL of the PDF file.
pages : str, optional (default: '1')
pages : str, optional (default: '1')
Comma-separated page numbers.
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'.
Example: '1,3,4' or '1,4-end' or 'all'.
password : str, optional (default: None)
password : str, optional (default: None)
Password for decryption.
Password for decryption.
"""
"""
def __init__(self, filepath, pages="1", password=None):
def __init__(self, filepath, pages="1", password=None):
if is_url(filepath):
if is_url(filepath):
filepath = download_url(filepath)
filepath = download_url(filepath)
self.filepath = filepath
self.filepath = filepath
if not filepath.lower().endswith(".pdf"):
if not filepath.lower().endswith(".pdf"):
raise NotImplementedError("File format not supported")
raise NotImplementedError("File format not supported")
if password is None:
if password is None:
self.password = ""
self.password = ""
else:
else:
self.password = password
self.password = password
if sys.version_info[0] < 3:
if sys.version_info[0] < 3:
self.password = self.password.encode("ascii")
self.password = self.password.encode("ascii")
self.pages = self._get_pages(self.filepath, pages)
self.pages = self._get_pages(self.filepath, pages)
def _get_pages(self, filepath, pages):
def _get_pages(self, filepath, pages):
"""Converts pages string to list of ints.
"""Converts pages string to list of ints.
Parameters
Parameters
----------
----------
filepath : str
filepath : str
Filepath or URL of the PDF file.
Filepath or URL of the PDF file.
pages : str, optional (default: '1')
pages : str, optional (default: '1')
Comma-separated page numbers.
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'.
Example: '1,3,4' or '1,4-end' or 'all'.
Returns
Returns
-------
-------
P : list
P : list
List of int page numbers.
List of int page numbers.
"""
"""
page_numbers = []
page_numbers = []
if pages == "1":
if pages == "1":
page_numbers.append({"start": 1, "end": 1})
page_numbers.append({"start": 1, "end": 1})
else:
else:
複製
已複製
複製
已複製
instream =
open(filepath
, "rb")
with fitz.
open(filepath
) as
infile
:
infile
= PdfFileReader(instream, strict=False)
if infile.
needsPass:
if infile.
isEncrypted:
infile.
authenticate
(self.password)
infile.
decrypt
(self.password)
if pages == "all":
if pages == "all":
page_numbers.append({"start": 1, "end": infile.
pageCount
})
page_numbers.append({"start": 1, "end": infile.
getNumPages()
})
else:
else:
for r in pages.split(","):
for r in pages.split(","):
if "-" in r:
if "-" in r:
a, b = r.split("-")
a, b = r.split("-")
if b == "end":
if b == "end":
b = infile.
pageCount
b = infile.
getNumPages()
page_numbers.append(
page_numbers.append(
{"start": int(a), "end": int(b)})
{"start": int(a), "end": int(b)})
else:
else:
page_numbers.append(
{"start": int(r), "end": int(r)})
page_numbers.append(
instream.close()
{"start": int(r), "end": int(r)})
P = []
P = []
for p in page_numbers:
for p in page_numbers:
P.extend(range(p["start"], p["end"] + 1))
P.extend(range(p["start"], p["end"] + 1))
return sorted(set(P))
return sorted(set(P))
def _save_page(self, filepath, page, temp):
def _save_page(self, filepath, page, temp):
"""Saves specified page from PDF into a temporary directory.
"""Saves specified page from PDF into a temporary directory.
Parameters
Parameters
----------
----------
filepath : str
filepath : str
Filepath or URL of the PDF file.
Filepath or URL of the PDF file.
page : int
page : int
Page number.
Page number.
temp : str
temp : str
Tmp directory.
Tmp directory.
"""
"""
複製
已複製
複製
已複製
with
open(filepath, "rb") as fileobj:
with
fitz.open(filepath) as infile:
infile = PdfFileReader(fileobj, strict=False)
if infile.needsPass:
if infile.isEncrypted:
infile.
authenticate
(self.password)
infile.
decrypt
(self.password)
fpath = os.path.join(temp, f"page-{page}.pdf")
fpath = os.path.join(temp, f"page-{page}.pdf")
froot, fext = os.path.splitext(fpath)
froot, fext = os.path.splitext(fpath)
複製
已複製
複製
已複製
p = infile
.getPage(
page - 1
)
p = infile
[
page - 1
]
outfile =
PdfFileWriter
()
p.setRotation(0
)
outfile.
addPage(p)
outfile =
fitz.open
()
with open(fpath, "wb") as f:
outpage =
outfile.
newPage(-1, width=p.rect.width,
outfile.
write(f)
height=p.rect.height)
outpage.showPDFpage(outpage.rect, infile, pno=page-1)
outfile.
save(fpath)
layout, dim = get_page_layout(fpath)
layout, dim = get_page_layout(fpath)
# fix rotated PDF
# fix rotated PDF
chars = get_text_objects(layout, ltype="char")
chars = get_text_objects(layout, ltype="char")
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
vertical_text = get_text_objects(layout, ltype="vertical_text")
vertical_text = get_text_objects(layout, ltype="vertical_text")
rotation = get_rotation(chars, horizontal_text, vertical_text)
rotation = get_rotation(chars, horizontal_text, vertical_text)
if rotation != "":
if rotation != "":
複製
已複製
複製
已複製
fpath_new = "".join(
[froot.replace("page", "p"), "_rotated", fext])
fpath_new = "".join(
[froot.replace("page", "p"), "_rotated", fext])
os.rename(fpath, fpath_new)
os.rename(fpath, fpath_new)
複製
已複製
複製
已複製
instream = open(fpath_new, "rb")
infile =
fitz.open(fpath_new
)
infile =
PdfFileReader(instream, strict=False
)
if infile.
needsPass
:
if infile.
isEncrypted
:
infile.
authenticate
(self.password)
infile.
decrypt
(self.password)
outfile =
fitz.open
()
outfile =
PdfFileWriter
()
p = infile
[0]
p = infile
.getPage(0)
outpage = outfile.newPage(-1, width=p.rect.width,
height=p.rect.height)
outpage.showPDFpage(outpage.rect, infile, pno=0)
if rotation == "anticlockwise":
if rotation == "anticlockwise":
複製
已複製
複製
已複製
p.rotateClockwise(
90)
outpage.setRotation((p.rotation +
90)
% 360)
elif rotation == "clockwise":
elif rotation == "clockwise":
複製
已複製
複製
已複製
p.rotateCounterClockwise(90)
outpage.setRotation((p.rotation + 270) % 360)
outfile.addPage(p)
with open(fpath, "wb") as f:
outfile.save(fpath)
outfile.write(f)
instream.close()
def parse(
def parse(
self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
):
):
"""Extracts tables by calling parser.get_tables on all single
"""Extracts tables by calling parser.get_tables on all single
page PDFs.
page PDFs.
Parameters
Parameters
----------
----------
flavor : str (default: 'lattice')
flavor : str (default: 'lattice')
The parsing method to use ('lattice' or 'stream').
The parsing method to use ('lattice' or 'stream').
Lattice is used by default.
Lattice is used by default.
suppress_stdout : str (default: False)
suppress_stdout : str (default: False)
Suppress logs and warnings.
Suppress logs and warnings.
layout_kwargs : dict, optional (default: {})
layout_kwargs : dict, optional (default: {})
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
kwargs : dict
kwargs : dict
See camelot.read_pdf kwargs.
See camelot.read_pdf kwargs.
Returns
Returns
-------
-------
tables : camelot.core.TableList
tables : camelot.core.TableList
List of tables found in PDF.
List of tables found in PDF.
"""
"""
tables = []
tables = []
with TemporaryDirectory() as tempdir:
with TemporaryDirectory() as tempdir:
複製
已複製
複製
已複製
for p in self.pages:
try:
self._save_page(self.filepath, p, tempdir)
for p in self.pages:
pages = [
self._save_page(self.filepath, p, tempdir)
os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages
pages = [
]
os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages
parser = Lattice(
**kwargs) if flavor == "lattice" else Stream(**kwargs)
]
for p in pages:
parser = Lattice(
t = parser.extract_tables(
**kwargs) if flavor == "lattice" else Stream(**kwargs)
p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
for p in pages:
)
t = parser.extract_tables(
tables.extend(t)
p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
)
tables.extend(t)
except ValueError as err:
if str(err) == "document closed or encrypted":
raise ValueError("file has not been decrypted") from err
raise
return TableList(sorted(tables))
return TableList(sorted(tables))
已保存差異
原始文本
開啟檔案
# -*- coding: utf-8 -*-

import os
import sys

from PyPDF2 import PdfFileReader, PdfFileWriter

from .core import TableList
from .parsers import Stream, Lattice
from .utils import (
    TemporaryDirectory,
    get_page_layout,
    get_text_objects,
    get_rotation,
    is_url,
    download_url,
)


class PDFHandler(object):
    """Handles all operations like temp directory creation, splitting
    file into single page PDFs, parsing each PDF and then removing the
    temp directory.

    Parameters
    ----------
    filepath : str
        Filepath or URL of the PDF file.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
    password : str, optional (default: None)
        Password for decryption.

    Raises
    ------
    NotImplementedError
        If the (possibly downloaded) file path does not end with '.pdf'.

    """

    def __init__(self, filepath, pages="1", password=None):
        # URLs are fetched first; from here on only the local path is used.
        if is_url(filepath):
            filepath = download_url(filepath)
        self.filepath = filepath
        if not filepath.lower().endswith(".pdf"):
            raise NotImplementedError("File format not supported")

        if password is None:
            self.password = ""
        else:
            self.password = password
            # Legacy Python 2 guard; never taken on Python 3.
            if sys.version_info[0] < 3:
                self.password = self.password.encode("ascii")
        self.pages = self._get_pages(self.filepath, pages)

    def _get_pages(self, filepath, pages):
        """Converts pages string to list of ints.

        Parameters
        ----------
        filepath : str
            Path of the PDF file.
        pages : str, optional (default: '1')
            Comma-separated page numbers.
            Example: '1,3,4' or '1,4-end' or 'all'.

        Returns
        -------
        P : list
            Sorted list of unique int page numbers.

        Raises
        ------
        ValueError
            If a page token cannot be converted to an int.

        """
        page_numbers = []
        if pages == "1":
            # Fast path: the default needs no page count, so skip opening the file.
            page_numbers.append({"start": 1, "end": 1})
        else:
            # The context manager closes the handle even if parsing a
            # page token raises.
            with open(filepath, "rb") as instream:
                infile = PdfFileReader(instream, strict=False)
                if infile.isEncrypted:
                    infile.decrypt(self.password)
                if pages == "all":
                    page_numbers.append({"start": 1, "end": infile.getNumPages()})
                else:
                    for r in pages.split(","):
                        if "-" in r:
                            a, b = r.split("-")
                            # Tolerate surrounding spaces, e.g. '4 - end'.
                            if b.strip() == "end":
                                b = infile.getNumPages()
                            page_numbers.append({"start": int(a), "end": int(b)})
                        else:
                            page_numbers.append({"start": int(r), "end": int(r)})

        P = []
        for p in page_numbers:
            P.extend(range(p["start"], p["end"] + 1))
        return sorted(set(P))

    def _save_page(self, filepath, page, temp):
        """Saves specified page from PDF into a temporary directory.

        The page is written to ``<temp>/page-<page>.pdf``. If the text
        on it is detected as rotated, that file is renamed to
        ``<temp>/p-<page>_rotated.pdf`` and a counter-rotated copy is
        written back to ``<temp>/page-<page>.pdf``.

        Parameters
        ----------
        filepath : str
            Path of the PDF file.
        page : int
            Page number (1-based).
        temp : str
            Tmp directory.

        """
        fpath = os.path.join(temp, f"page-{page}.pdf")
        fext = os.path.splitext(fpath)[1]

        with open(filepath, "rb") as fileobj:
            infile = PdfFileReader(fileobj, strict=False)
            if infile.isEncrypted:
                infile.decrypt(self.password)
            p = infile.getPage(page - 1)
            outfile = PdfFileWriter()
            outfile.addPage(p)
            # Written while the source handle is still open.
            with open(fpath, "wb") as f:
                outfile.write(f)

        layout, dim = get_page_layout(fpath)
        # fix rotated PDF
        chars = get_text_objects(layout, ltype="char")
        horizontal_text = get_text_objects(layout, ltype="horizontal_text")
        vertical_text = get_text_objects(layout, ltype="vertical_text")
        rotation = get_rotation(chars, horizontal_text, vertical_text)
        if rotation != "":
            # Build the new name from the file name only, so a temp
            # directory whose path contains "page" is left untouched.
            fpath_new = os.path.join(temp, f"p-{page}_rotated{fext}")
            os.rename(fpath, fpath_new)
            # The context manager closes the handle even if rotating or
            # writing raises.
            with open(fpath_new, "rb") as instream:
                infile = PdfFileReader(instream, strict=False)
                if infile.isEncrypted:
                    infile.decrypt(self.password)
                outfile = PdfFileWriter()
                p = infile.getPage(0)
                # Rotate against the detected direction to undo it.
                if rotation == "anticlockwise":
                    p.rotateClockwise(90)
                elif rotation == "clockwise":
                    p.rotateCounterClockwise(90)
                outfile.addPage(p)
                with open(fpath, "wb") as f:
                    outfile.write(f)

    def parse(
        self, flavor="lattice", suppress_stdout=False, layout_kwargs=None, **kwargs
    ):
        """Extracts tables by calling parser.extract_tables on all single
        page PDFs.

        Parameters
        ----------
        flavor : str (default: 'lattice')
            The parsing method to use ('lattice' or 'stream').
            Lattice is used by default.
        suppress_stdout : bool (default: False)
            Suppress logs and warnings.
        layout_kwargs : dict, optional (default: None)
            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
            ``None`` is treated as an empty dict.
        kwargs : dict
            See camelot.read_pdf kwargs.

        Returns
        -------
        tables : camelot.core.TableList
            List of tables found in PDF.

        """
        # A fresh dict per call avoids sharing one mutable default.
        if layout_kwargs is None:
            layout_kwargs = {}

        tables = []
        with TemporaryDirectory() as tempdir:
            for p in self.pages:
                self._save_page(self.filepath, p, tempdir)
            pages = [os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages]
            parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
            for p in pages:
                t = parser.extract_tables(
                    p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
                )
                tables.extend(t)
        return TableList(sorted(tables))
更改後文本
開啟檔案
# -*- coding: utf-8 -*-

import os
import sys

import fitz

from .core import TableList
from .parsers import Stream, Lattice
from .utils import (
    TemporaryDirectory,
    get_page_layout,
    get_text_objects,
    get_rotation,
    is_url,
    download_url,
)


class PDFHandler(object):
    """Handles all operations like temp directory creation, splitting
    file into single page PDFs, parsing each PDF and then removing the
    temp directory.

    Parameters
    ----------
    filepath : str
        Filepath or URL of the PDF file.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
    password : str, optional (default: None)
        Password for decryption.

    Raises
    ------
    NotImplementedError
        If the (possibly downloaded) file path does not end with '.pdf'.

    """

    def __init__(self, filepath, pages="1", password=None):
        # URLs are fetched first; from here on only the local path is used.
        if is_url(filepath):
            filepath = download_url(filepath)
        self.filepath = filepath
        if not filepath.lower().endswith(".pdf"):
            raise NotImplementedError("File format not supported")

        if password is None:
            self.password = ""
        else:
            self.password = password
            # Legacy Python 2 guard; never taken on Python 3.
            if sys.version_info[0] < 3:
                self.password = self.password.encode("ascii")
        self.pages = self._get_pages(self.filepath, pages)

    def _get_pages(self, filepath, pages):
        """Converts pages string to list of ints.

        Parameters
        ----------
        filepath : str
            Path of the PDF file.
        pages : str, optional (default: '1')
            Comma-separated page numbers.
            Example: '1,3,4' or '1,4-end' or 'all'.

        Returns
        -------
        P : list
            Sorted list of unique int page numbers.

        Raises
        ------
        ValueError
            If a page token cannot be converted to an int.

        """
        page_numbers = []
        if pages == "1":
            # Fast path: the default needs no page count, so skip opening the file.
            page_numbers.append({"start": 1, "end": 1})
        else:
            with fitz.open(filepath) as infile:
                if infile.needsPass:
                    infile.authenticate(self.password)
                if pages == "all":
                    page_numbers.append({"start": 1, "end": infile.pageCount})
                else:
                    for r in pages.split(","):
                        if "-" in r:
                            a, b = r.split("-")
                            # Tolerate surrounding spaces, e.g. '4 - end'.
                            if b.strip() == "end":
                                b = infile.pageCount
                            page_numbers.append({"start": int(a), "end": int(b)})
                        else:
                            page_numbers.append({"start": int(r), "end": int(r)})

        P = []
        for p in page_numbers:
            P.extend(range(p["start"], p["end"] + 1))
        return sorted(set(P))

    def _save_page(self, filepath, page, temp):
        """Saves specified page from PDF into a temporary directory.

        The page is written to ``<temp>/page-<page>.pdf``. If the text
        on it is detected as rotated, that file is renamed to
        ``<temp>/p-<page>_rotated.pdf`` and a counter-rotated copy is
        written back to ``<temp>/page-<page>.pdf``.

        Parameters
        ----------
        filepath : str
            Path of the PDF file.
        page : int
            Page number (1-based).
        temp : str
            Tmp directory.

        """
        fpath = os.path.join(temp, f"page-{page}.pdf")
        fext = os.path.splitext(fpath)[1]

        with fitz.open(filepath) as infile:
            if infile.needsPass:
                infile.authenticate(self.password)
            p = infile[page - 1]
            # Clear the page's rotation flag before copying it.
            p.setRotation(0)
            # The context manager closes the new document even if
            # copying or saving raises.
            with fitz.open() as outfile:
                outpage = outfile.newPage(
                    -1, width=p.rect.width, height=p.rect.height
                )
                outpage.showPDFpage(outpage.rect, infile, pno=page - 1)
                outfile.save(fpath)

        layout, dim = get_page_layout(fpath)
        # fix rotated PDF
        chars = get_text_objects(layout, ltype="char")
        horizontal_text = get_text_objects(layout, ltype="horizontal_text")
        vertical_text = get_text_objects(layout, ltype="vertical_text")
        rotation = get_rotation(chars, horizontal_text, vertical_text)
        if rotation != "":
            # Build the new name from the file name only, so a temp
            # directory whose path contains "page" is left untouched.
            fpath_new = os.path.join(temp, f"p-{page}_rotated{fext}")
            os.rename(fpath, fpath_new)
            # Both documents are closed on exit, including on error.
            with fitz.open(fpath_new) as rotated_src, fitz.open() as outfile:
                if rotated_src.needsPass:
                    rotated_src.authenticate(self.password)
                p = rotated_src[0]
                outpage = outfile.newPage(
                    -1, width=p.rect.width, height=p.rect.height
                )
                outpage.showPDFpage(outpage.rect, rotated_src, pno=0)
                # Rotate against the detected direction to undo it.
                if rotation == "anticlockwise":
                    outpage.setRotation((p.rotation + 90) % 360)
                elif rotation == "clockwise":
                    outpage.setRotation((p.rotation + 270) % 360)
                outfile.save(fpath)

    def parse(
        self, flavor="lattice", suppress_stdout=False, layout_kwargs=None, **kwargs
    ):
        """Extracts tables by calling parser.extract_tables on all single
        page PDFs.

        Parameters
        ----------
        flavor : str (default: 'lattice')
            The parsing method to use ('lattice' or 'stream').
            Lattice is used by default.
        suppress_stdout : bool (default: False)
            Suppress logs and warnings.
        layout_kwargs : dict, optional (default: None)
            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
            ``None`` is treated as an empty dict.
        kwargs : dict
            See camelot.read_pdf kwargs.

        Returns
        -------
        tables : camelot.core.TableList
            List of tables found in PDF.

        Raises
        ------
        ValueError
            With the message "file has not been decrypted" when a
            ValueError whose text is "document closed or encrypted" is
            raised while saving or parsing pages; any other ValueError
            is re-raised unchanged.

        """
        # A fresh dict per call avoids sharing one mutable default.
        if layout_kwargs is None:
            layout_kwargs = {}

        tables = []
        with TemporaryDirectory() as tempdir:
            try:
                for p in self.pages:
                    self._save_page(self.filepath, p, tempdir)
                pages = [
                    os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages
                ]
                parser = (
                    Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
                )
                for p in pages:
                    t = parser.extract_tables(
                        p,
                        suppress_stdout=suppress_stdout,
                        layout_kwargs=layout_kwargs,
                    )
                    tables.extend(t)
            except ValueError as err:
                # Replace the low-level message with a clearer one and
                # keep the original error as the cause.
                if str(err) == "document closed or encrypted":
                    raise ValueError("file has not been decrypted") from err
                raise
        return TableList(sorted(tables))
尋找差異