Diff
checker
テキスト
テキスト
画像
ドキュメント
Excel
フォルダ
Legal
Enterprise
デスクトップ
料金
ログイン
Diffchecker デスクトップのダウンロード
テキスト比較
2 つのテキスト ファイルの違いを見つける
ツール
履歴
ライブエディター
未変更行を折りたたむ
折り返しなし
レイアウト
分割
統合
比較精度
スマート
単語
文字
シンタックスハイライト
構文を選択
無視
テキスト変換
最初の差分へ移動
入力を編集
Diffchecker Desktop
Diffcheckerを実行する最も安全な方法。Diffchecker Desktopアプリを入手:あなたの差分はコンピューターから出ることはありません!
Desktopを入手
camelot/handlers.py
作成日
5 年前
差分は期限切れになりません
クリア
エクスポート
共有
説明
42 削除
行
合計
削除
文字
合計
削除
この機能を引き続き使用するには、アップグレードしてください
Diff
checker
Pro
価格を見る
169 行
すべてコピー
77 追加
行
合計
追加
文字
合計
追加
この機能を引き続き使用するには、アップグレードしてください
Diff
checker
Pro
価格を見る
181 行
すべてコピー
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
import os
import os
import sys
import sys
コピー
コピー済み
コピー
コピー済み
from PyPDF2
import
PdfFileReader, PdfFileWriter
import
fitz
from .core import TableList
from .core import TableList
from .parsers import Stream, Lattice
from .parsers import Stream, Lattice
from .utils import (
from .utils import (
TemporaryDirectory,
TemporaryDirectory,
get_page_layout,
get_page_layout,
get_text_objects,
get_text_objects,
get_rotation,
get_rotation,
is_url,
is_url,
download_url,
download_url,
)
)
class PDFHandler(object):
class PDFHandler(object):
"""Handles all operations like temp directory creation, splitting
"""Handles all operations like temp directory creation, splitting
file into single page PDFs, parsing each PDF and then removing the
file into single page PDFs, parsing each PDF and then removing the
temp directory.
temp directory.
Parameters
Parameters
----------
----------
filepath : str
filepath : str
Filepath or URL of the PDF file.
Filepath or URL of the PDF file.
pages : str, optional (default: '1')
pages : str, optional (default: '1')
Comma-separated page numbers.
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'.
Example: '1,3,4' or '1,4-end' or 'all'.
password : str, optional (default: None)
password : str, optional (default: None)
Password for decryption.
Password for decryption.
"""
"""
def __init__(self, filepath, pages="1", password=None):
def __init__(self, filepath, pages="1", password=None):
if is_url(filepath):
if is_url(filepath):
filepath = download_url(filepath)
filepath = download_url(filepath)
self.filepath = filepath
self.filepath = filepath
if not filepath.lower().endswith(".pdf"):
if not filepath.lower().endswith(".pdf"):
raise NotImplementedError("File format not supported")
raise NotImplementedError("File format not supported")
if password is None:
if password is None:
self.password = ""
self.password = ""
else:
else:
self.password = password
self.password = password
if sys.version_info[0] < 3:
if sys.version_info[0] < 3:
self.password = self.password.encode("ascii")
self.password = self.password.encode("ascii")
self.pages = self._get_pages(self.filepath, pages)
self.pages = self._get_pages(self.filepath, pages)
def _get_pages(self, filepath, pages):
def _get_pages(self, filepath, pages):
"""Converts pages string to list of ints.
"""Converts pages string to list of ints.
Parameters
Parameters
----------
----------
filepath : str
filepath : str
Filepath or URL of the PDF file.
Filepath or URL of the PDF file.
pages : str, optional (default: '1')
pages : str, optional (default: '1')
Comma-separated page numbers.
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'.
Example: '1,3,4' or '1,4-end' or 'all'.
Returns
Returns
-------
-------
P : list
P : list
List of int page numbers.
List of int page numbers.
"""
"""
page_numbers = []
page_numbers = []
if pages == "1":
if pages == "1":
page_numbers.append({"start": 1, "end": 1})
page_numbers.append({"start": 1, "end": 1})
else:
else:
コピー
コピー済み
コピー
コピー済み
instream =
open(filepath
, "rb")
with fitz.
open(filepath
) as
infile
:
infile
= PdfFileReader(instream, strict=False)
if infile.
needsPass:
if infile.
isEncrypted:
infile.
authenticate
(self.password)
infile.
decrypt
(self.password)
if pages == "all":
if pages == "all":
page_numbers.append({"start": 1, "end": infile.
pageCount
})
page_numbers.append({"start": 1, "end": infile.
getNumPages()
})
else:
else:
for r in pages.split(","):
for r in pages.split(","):
if "-" in r:
if "-" in r:
a, b = r.split("-")
a, b = r.split("-")
if b == "end":
if b == "end":
b = infile.
pageCount
b = infile.
getNumPages()
page_numbers.append(
page_numbers.append(
{"start": int(a), "end": int(b)})
{"start": int(a), "end": int(b)})
else:
else:
page_numbers.append(
{"start": int(r), "end": int(r)})
page_numbers.append(
instream.close()
{"start": int(r), "end": int(r)})
P = []
P = []
for p in page_numbers:
for p in page_numbers:
P.extend(range(p["start"], p["end"] + 1))
P.extend(range(p["start"], p["end"] + 1))
return sorted(set(P))
return sorted(set(P))
def _save_page(self, filepath, page, temp):
def _save_page(self, filepath, page, temp):
"""Saves specified page from PDF into a temporary directory.
"""Saves specified page from PDF into a temporary directory.
Parameters
Parameters
----------
----------
filepath : str
filepath : str
Filepath or URL of the PDF file.
Filepath or URL of the PDF file.
page : int
page : int
Page number.
Page number.
temp : str
temp : str
Tmp directory.
Tmp directory.
"""
"""
コピー
コピー済み
コピー
コピー済み
with
open(filepath, "rb") as fileobj:
with
fitz.open(filepath) as infile:
infile = PdfFileReader(fileobj, strict=False)
if infile.needsPass:
if infile.isEncrypted:
infile.
authenticate
(self.password)
infile.
decrypt
(self.password)
fpath = os.path.join(temp, f"page-{page}.pdf")
fpath = os.path.join(temp, f"page-{page}.pdf")
froot, fext = os.path.splitext(fpath)
froot, fext = os.path.splitext(fpath)
コピー
コピー済み
コピー
コピー済み
p = infile
.getPage(
page - 1
)
p = infile
[
page - 1
]
outfile =
PdfFileWriter
()
p.setRotation(0
)
outfile.
addPage(p)
outfile =
fitz.open
()
with open(fpath, "wb") as f:
outpage =
outfile.
newPage(-1, width=p.rect.width,
outfile.
write(f)
height=p.rect.height)
outpage.showPDFpage(outpage.rect, infile, pno=page-1)
outfile.
save(fpath)
layout, dim = get_page_layout(fpath)
layout, dim = get_page_layout(fpath)
# fix rotated PDF
# fix rotated PDF
chars = get_text_objects(layout, ltype="char")
chars = get_text_objects(layout, ltype="char")
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
vertical_text = get_text_objects(layout, ltype="vertical_text")
vertical_text = get_text_objects(layout, ltype="vertical_text")
rotation = get_rotation(chars, horizontal_text, vertical_text)
rotation = get_rotation(chars, horizontal_text, vertical_text)
if rotation != "":
if rotation != "":
コピー
コピー済み
コピー
コピー済み
fpath_new = "".join(
[froot.replace("page", "p"), "_rotated", fext])
fpath_new = "".join(
[froot.replace("page", "p"), "_rotated", fext])
os.rename(fpath, fpath_new)
os.rename(fpath, fpath_new)
コピー
コピー済み
コピー
コピー済み
instream = open(fpath_new, "rb")
infile =
fitz.open(fpath_new
)
infile =
PdfFileReader(instream, strict=False
)
if infile.
needsPass
:
if infile.
isEncrypted
:
infile.
authenticate
(self.password)
infile.
decrypt
(self.password)
outfile =
fitz.open
()
outfile =
PdfFileWriter
()
p = infile
[0]
p = infile
.getPage(0)
outpage = outfile.newPage(-1, width=p.rect.width,
height=p.rect.height)
outpage.showPDFpage(outpage.rect, infile, pno=0)
if rotation == "anticlockwise":
if rotation == "anticlockwise":
コピー
コピー済み
コピー
コピー済み
p.rotateClockwise(
90)
outpage.setRotation((p.rotation +
90)
% 360)
elif rotation == "clockwise":
elif rotation == "clockwise":
コピー
コピー済み
コピー
コピー済み
p.rotateCounterClockwise(90)
outpage.setRotation((p.rotation + 270) % 360)
outfile.addPage(p)
with open(fpath, "wb") as f:
outfile.save(fpath)
outfile.write(f)
instream.close()
def parse(
def parse(
self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
):
):
"""Extracts tables by calling parser.get_tables on all single
"""Extracts tables by calling parser.get_tables on all single
page PDFs.
page PDFs.
Parameters
Parameters
----------
----------
flavor : str (default: 'lattice')
flavor : str (default: 'lattice')
The parsing method to use ('lattice' or 'stream').
The parsing method to use ('lattice' or 'stream').
Lattice is used by default.
Lattice is used by default.
suppress_stdout : str (default: False)
suppress_stdout : str (default: False)
Suppress logs and warnings.
Suppress logs and warnings.
layout_kwargs : dict, optional (default: {})
layout_kwargs : dict, optional (default: {})
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
kwargs : dict
kwargs : dict
See camelot.read_pdf kwargs.
See camelot.read_pdf kwargs.
Returns
Returns
-------
-------
tables : camelot.core.TableList
tables : camelot.core.TableList
List of tables found in PDF.
List of tables found in PDF.
"""
"""
tables = []
tables = []
with TemporaryDirectory() as tempdir:
with TemporaryDirectory() as tempdir:
コピー
コピー済み
コピー
コピー済み
for p in self.pages:
try:
self._save_page(self.filepath, p, tempdir)
for p in self.pages:
pages = [
self._save_page(self.filepath, p, tempdir)
os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages
pages = [
]
os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages
parser = Lattice(
**kwargs) if flavor == "lattice" else Stream(**kwargs)
]
for p in pages:
parser = Lattice(
t = parser.extract_tables(
**kwargs) if flavor == "lattice" else Stream(**kwargs)
p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
for p in pages:
)
t = parser.extract_tables(
tables.extend(t)
p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
)
tables.extend(t)
except ValueError as err:
if str(err) == "document closed or encrypted":
raise ValueError("file has not been decrypted") from err
raise
return TableList(sorted(tables))
return TableList(sorted(tables))
保存された差分
原文
ファイルを開く
# -*- coding: utf-8 -*-

import os
import sys

from PyPDF2 import PdfFileReader, PdfFileWriter

from .core import TableList
from .parsers import Stream, Lattice
from .utils import (
    TemporaryDirectory,
    get_page_layout,
    get_text_objects,
    get_rotation,
    is_url,
    download_url,
)


class PDFHandler(object):
    """Handles all operations like temp directory creation, splitting
    file into single page PDFs, parsing each PDF and then removing the
    temp directory.

    Parameters
    ----------
    filepath : str
        Filepath or URL of the PDF file.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
    password : str, optional (default: None)
        Password for decryption.

    Raises
    ------
    NotImplementedError
        If the (possibly downloaded) file path does not end in ".pdf".

    """

    def __init__(self, filepath, pages="1", password=None):
        # A URL is replaced by whatever download_url() returns for it.
        if is_url(filepath):
            filepath = download_url(filepath)
        self.filepath = filepath
        if not filepath.lower().endswith(".pdf"):
            raise NotImplementedError("File format not supported")

        if password is None:
            self.password = ""
        else:
            self.password = password
            # Legacy Python 2 guard; the f-strings used further down already
            # require Python 3.6+, so this branch is not expected to run.
            if sys.version_info[0] < 3:
                self.password = self.password.encode("ascii")
        self.pages = self._get_pages(self.filepath, pages)

    def _get_pages(self, filepath, pages):
        """Converts pages string to list of ints.

        Parameters
        ----------
        filepath : str
            Filepath of the PDF file.
        pages : str
            Comma-separated page numbers.
            Example: '1,3,4' or '1,4-end' or 'all'.

        Returns
        -------
        P : list
            Sorted list of unique int page numbers.

        """
        page_numbers = []
        if pages == "1":
            # Fast path: the default selection needs no look at the file.
            page_numbers.append({"start": 1, "end": 1})
        else:
            # The context manager guarantees the handle is released even
            # when parsing the PDF or the page spec raises.
            with open(filepath, "rb") as instream:
                infile = PdfFileReader(instream, strict=False)
                if infile.isEncrypted:
                    infile.decrypt(self.password)
                if pages == "all":
                    page_numbers.append({"start": 1, "end": infile.getNumPages()})
                else:
                    for r in pages.split(","):
                        if "-" in r:
                            a, b = r.split("-")
                            if b == "end":
                                b = infile.getNumPages()
                            page_numbers.append({"start": int(a), "end": int(b)})
                        else:
                            page_numbers.append({"start": int(r), "end": int(r)})
        P = []
        for p in page_numbers:
            P.extend(range(p["start"], p["end"] + 1))
        return sorted(set(P))

    def _save_page(self, filepath, page, temp):
        """Saves specified page from PDF into a temporary directory.

        The page is written to ``<temp>/page-<page>.pdf``. If get_rotation()
        reports a rotation, that file is renamed to
        ``<temp>/p-<page>_rotated.pdf`` and a counter-rotated copy is written
        back to ``<temp>/page-<page>.pdf``.

        Parameters
        ----------
        filepath : str
            Filepath of the PDF file.
        page : int
            Page number (1-based).
        temp : str
            Tmp directory.

        """
        fpath = os.path.join(temp, f"page-{page}.pdf")

        with open(filepath, "rb") as fileobj:
            infile = PdfFileReader(fileobj, strict=False)
            if infile.isEncrypted:
                infile.decrypt(self.password)
            p = infile.getPage(page - 1)
            outfile = PdfFileWriter()
            outfile.addPage(p)
            with open(fpath, "wb") as f:
                outfile.write(f)

        layout, _ = get_page_layout(fpath)
        # fix rotated PDF
        chars = get_text_objects(layout, ltype="char")
        horizontal_text = get_text_objects(layout, ltype="horizontal_text")
        vertical_text = get_text_objects(layout, ltype="vertical_text")
        rotation = get_rotation(chars, horizontal_text, vertical_text)
        if rotation != "":
            # Build the name from the directory and page number directly;
            # a str.replace("page", "p") on the full path would also rewrite
            # any "page" occurring inside the temp directory path.
            fpath_new = os.path.join(temp, f"p-{page}_rotated.pdf")
            os.rename(fpath, fpath_new)
            with open(fpath_new, "rb") as instream:
                infile = PdfFileReader(instream, strict=False)
                if infile.isEncrypted:
                    infile.decrypt(self.password)
                outfile = PdfFileWriter()
                p = infile.getPage(0)
                # Turn the page the opposite way to the detected rotation.
                if rotation == "anticlockwise":
                    p.rotateClockwise(90)
                elif rotation == "clockwise":
                    p.rotateCounterClockwise(90)
                outfile.addPage(p)
                with open(fpath, "wb") as f:
                    outfile.write(f)

    def parse(
        self, flavor="lattice", suppress_stdout=False, layout_kwargs=None, **kwargs
    ):
        """Extracts tables by calling parser.extract_tables on all single
        page PDFs.

        Parameters
        ----------
        flavor : str (default: 'lattice')
            The parsing method to use ('lattice' or 'stream').
            Lattice is used by default.
        suppress_stdout : bool (default: False)
            Suppress logs and warnings.
        layout_kwargs : dict, optional (default: None, treated as {})
            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
        kwargs : dict
            See camelot.read_pdf kwargs.

        Returns
        -------
        tables : camelot.core.TableList
            List of tables found in PDF.

        """
        # Avoid sharing one dict object between calls via a mutable default.
        if layout_kwargs is None:
            layout_kwargs = {}

        tables = []
        with TemporaryDirectory() as tempdir:
            for p in self.pages:
                self._save_page(self.filepath, p, tempdir)
            pages = [os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages]
            # Any flavor other than "lattice" falls back to Stream.
            parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
            for p in pages:
                t = parser.extract_tables(
                    p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
                )
                tables.extend(t)
        return TableList(sorted(tables))
変更されたテキスト
ファイルを開く
# -*- coding: utf-8 -*-

import os
import sys

import fitz

from .core import TableList
from .parsers import Stream, Lattice
from .utils import (
    TemporaryDirectory,
    get_page_layout,
    get_text_objects,
    get_rotation,
    is_url,
    download_url,
)


class PDFHandler(object):
    """Handles all operations like temp directory creation, splitting
    file into single page PDFs, parsing each PDF and then removing the
    temp directory.

    Parameters
    ----------
    filepath : str
        Filepath or URL of the PDF file.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
    password : str, optional (default: None)
        Password for decryption.

    Raises
    ------
    NotImplementedError
        If the (possibly downloaded) file path does not end in ".pdf".

    """

    def __init__(self, filepath, pages="1", password=None):
        # A URL is replaced by whatever download_url() returns for it.
        if is_url(filepath):
            filepath = download_url(filepath)
        self.filepath = filepath
        if not filepath.lower().endswith(".pdf"):
            raise NotImplementedError("File format not supported")

        if password is None:
            self.password = ""
        else:
            self.password = password
            # Legacy Python 2 guard; the f-strings used further down already
            # require Python 3.6+, so this branch is not expected to run.
            if sys.version_info[0] < 3:
                self.password = self.password.encode("ascii")
        self.pages = self._get_pages(self.filepath, pages)

    def _get_pages(self, filepath, pages):
        """Converts pages string to list of ints.

        Parameters
        ----------
        filepath : str
            Filepath of the PDF file.
        pages : str
            Comma-separated page numbers.
            Example: '1,3,4' or '1,4-end' or 'all'.

        Returns
        -------
        P : list
            Sorted list of unique int page numbers.

        """
        page_numbers = []
        if pages == "1":
            # Fast path: the default selection needs no look at the file.
            page_numbers.append({"start": 1, "end": 1})
        else:
            with fitz.open(filepath) as infile:
                if infile.needsPass:
                    infile.authenticate(self.password)
                if pages == "all":
                    page_numbers.append({"start": 1, "end": infile.pageCount})
                else:
                    for r in pages.split(","):
                        if "-" in r:
                            a, b = r.split("-")
                            if b == "end":
                                b = infile.pageCount
                            page_numbers.append({"start": int(a), "end": int(b)})
                        else:
                            page_numbers.append({"start": int(r), "end": int(r)})
        P = []
        for p in page_numbers:
            P.extend(range(p["start"], p["end"] + 1))
        return sorted(set(P))

    def _save_page(self, filepath, page, temp):
        """Saves specified page from PDF into a temporary directory.

        The page is written to ``<temp>/page-<page>.pdf``. If get_rotation()
        reports a rotation, that file is renamed to
        ``<temp>/p-<page>_rotated.pdf`` and a counter-rotated copy is written
        back to ``<temp>/page-<page>.pdf``.

        Parameters
        ----------
        filepath : str
            Filepath of the PDF file.
        page : int
            Page number (1-based).
        temp : str
            Tmp directory.

        """
        fpath = os.path.join(temp, f"page-{page}.pdf")

        # Both documents are closed on exit, including when an error is
        # raised part-way through the copy.
        with fitz.open(filepath) as infile, fitz.open() as outfile:
            if infile.needsPass:
                infile.authenticate(self.password)
            p = infile[page - 1]
            # Reset the rotation on the in-memory page before copying it;
            # the source document itself is never saved.
            p.setRotation(0)
            outpage = outfile.newPage(-1, width=p.rect.width, height=p.rect.height)
            outpage.showPDFpage(outpage.rect, infile, pno=page - 1)
            outfile.save(fpath)

        layout, _ = get_page_layout(fpath)
        # fix rotated PDF
        chars = get_text_objects(layout, ltype="char")
        horizontal_text = get_text_objects(layout, ltype="horizontal_text")
        vertical_text = get_text_objects(layout, ltype="vertical_text")
        rotation = get_rotation(chars, horizontal_text, vertical_text)
        if rotation != "":
            # Build the name from the directory and page number directly;
            # a str.replace("page", "p") on the full path would also rewrite
            # any "page" occurring inside the temp directory path.
            fpath_new = os.path.join(temp, f"p-{page}_rotated.pdf")
            os.rename(fpath, fpath_new)
            # Use fresh names and context managers so the renamed file and
            # the new output document do not stay open after this method.
            with fitz.open(fpath_new) as rotated, fitz.open() as outfile:
                if rotated.needsPass:
                    rotated.authenticate(self.password)
                p = rotated[0]
                outpage = outfile.newPage(
                    -1, width=p.rect.width, height=p.rect.height
                )
                outpage.showPDFpage(outpage.rect, rotated, pno=0)
                if rotation == "anticlockwise":
                    outpage.setRotation((p.rotation + 90) % 360)
                elif rotation == "clockwise":
                    outpage.setRotation((p.rotation + 270) % 360)
                outfile.save(fpath)

    def parse(
        self, flavor="lattice", suppress_stdout=False, layout_kwargs=None, **kwargs
    ):
        """Extracts tables by calling parser.extract_tables on all single
        page PDFs.

        Parameters
        ----------
        flavor : str (default: 'lattice')
            The parsing method to use ('lattice' or 'stream').
            Lattice is used by default.
        suppress_stdout : bool (default: False)
            Suppress logs and warnings.
        layout_kwargs : dict, optional (default: None, treated as {})
            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
        kwargs : dict
            See camelot.read_pdf kwargs.

        Returns
        -------
        tables : camelot.core.TableList
            List of tables found in PDF.

        Raises
        ------
        ValueError
            "file has not been decrypted" when a ValueError whose message is
            "document closed or encrypted" is raised while saving or parsing
            pages; any other ValueError is re-raised unchanged.

        """
        # Avoid sharing one dict object between calls via a mutable default.
        if layout_kwargs is None:
            layout_kwargs = {}

        tables = []
        with TemporaryDirectory() as tempdir:
            try:
                for p in self.pages:
                    self._save_page(self.filepath, p, tempdir)
                pages = [os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages]
                # Any flavor other than "lattice" falls back to Stream.
                parser = (
                    Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
                )
                for p in pages:
                    t = parser.extract_tables(
                        p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
                    )
                    tables.extend(t)
            except ValueError as err:
                # Translate this one specific message into a clearer error,
                # keeping the original exception as the cause.
                if str(err) == "document closed or encrypted":
                    raise ValueError("file has not been decrypted") from err
                raise
        return TableList(sorted(tables))
違いを見つける