Comparar texto

Encuentra la diferencia entre dos archivos de texto

Editor live

Ocultar sin cambios

Sin ajuste de línea

Vista

Nivel de detalle

Resaltado de sintaxis

Diffchecker Desktop: la forma más segura de usar Diffchecker. ¡Obtén la app de Diffchecker Desktop: tus diffs nunca salen de tu computadora! Obtener Desktop

camelot/handlers.py

Creado hace 5 años. El diff nunca expira.

42 eliminaciones

Líneas
Total
Eliminado

Caracteres
Total
Eliminado

Para continuar usando esta función, actualice a Diffchecker Pro. Ver precios

169 líneas

77 adiciones

Líneas
Total
Añadido

Caracteres
Total
Añadido

Para continuar usando esta función, actualice a Diffchecker Pro. Ver precios

181 líneas

# -*- coding: utf-8 -*-

import os

import sys

from PyPDF2 import PdfFileReader, PdfFileWriter

import fitz

from .core import TableList

from .parsers import Stream, Lattice

from .utils import (

TemporaryDirectory,

get_page_layout,

get_text_objects,

get_rotation,

is_url,

download_url,

)

class PDFHandler(object):

"""Handles all operations like temp directory creation, splitting

file into single page PDFs, parsing each PDF and then removing the

temp directory.

Parameters

----------

filepath : str

Filepath or URL of the PDF file.

pages : str, optional (default: '1')

Comma-separated page numbers.

Example: '1,3,4' or '1,4-end' or 'all'.

password : str, optional (default: None)

Password for decryption.

"""

def __init__(self, filepath, pages="1", password=None):

if is_url(filepath):

filepath = download_url(filepath)

self.filepath = filepath

if not filepath.lower().endswith(".pdf"):

raise NotImplementedError("File format not supported")

if password is None:

self.password = ""

else:

self.password = password

if sys.version_info[0] < 3:

self.password = self.password.encode("ascii")

self.pages = self._get_pages(self.filepath, pages)

def _get_pages(self, filepath, pages):

"""Converts pages string to list of ints.

Parameters

----------

filepath : str

Filepath or URL of the PDF file.

pages : str, optional (default: '1')

Comma-separated page numbers.

Example: '1,3,4' or '1,4-end' or 'all'.

Returns

-------

P : list

List of int page numbers.

"""

page_numbers = []

if pages == "1":

page_numbers.append({"start": 1, "end": 1})

else:

instream = open(filepath, "rb")

with fitz.open(filepath) as infile:

infile = PdfFileReader(instream, strict=False)

if infile.needsPass:

if infile.isEncrypted:

infile.authenticate(self.password)

infile.decrypt(self.password)

if pages == "all":

page_numbers.append({"start": 1, "end": infile.pageCount})

page_numbers.append({"start": 1, "end": infile.getNumPages()})

else:

for r in pages.split(","):

if "-" in r:

a, b = r.split("-")

if b == "end":

b = infile.pageCount

b = infile.getNumPages()

page_numbers.append(

page_numbers.append({"start": int(a), "end": int(b)})

{"start": int(a), "end": int(b)})

else:

page_numbers.append({"start": int(r), "end": int(r)})

page_numbers.append(

instream.close()

{"start": int(r), "end": int(r)})

P = []

for p in page_numbers:

P.extend(range(p["start"], p["end"] + 1))

return sorted(set(P))

def _save_page(self, filepath, page, temp):

"""Saves specified page from PDF into a temporary directory.

Parameters

----------

filepath : str

Filepath or URL of the PDF file.

page : int

Page number.

temp : str

Tmp directory.

"""

with open(filepath, "rb") as fileobj:

with fitz.open(filepath) as infile:

infile = PdfFileReader(fileobj, strict=False)

if infile.needsPass:

if infile.isEncrypted:

infile.authenticate(self.password)

infile.decrypt(self.password)

fpath = os.path.join(temp, f"page-{page}.pdf")

froot, fext = os.path.splitext(fpath)

p = infile.getPage(page - 1)

p = infile[page - 1]

outfile = PdfFileWriter()

p.setRotation(0)

outfile.addPage(p)

outfile = fitz.open()

with open(fpath, "wb") as f:

outpage = outfile.newPage(-1, width=p.rect.width,

outfile.write(f)

height=p.rect.height)

outpage.showPDFpage(outpage.rect, infile, pno=page-1)

outfile.save(fpath)

layout, dim = get_page_layout(fpath)

# fix rotated PDF

chars = get_text_objects(layout, ltype="char")

horizontal_text = get_text_objects(layout, ltype="horizontal_text")

vertical_text = get_text_objects(layout, ltype="vertical_text")

rotation = get_rotation(chars, horizontal_text, vertical_text)

if rotation != "":

fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])

fpath_new = "".join(

[froot.replace("page", "p"), "_rotated", fext])

os.rename(fpath, fpath_new)

instream = open(fpath_new, "rb")

infile = fitz.open(fpath_new)

infile = PdfFileReader(instream, strict=False)

if infile.needsPass:

if infile.isEncrypted:

infile.authenticate(self.password)

infile.decrypt(self.password)

outfile = fitz.open()

outfile = PdfFileWriter()

p = infile[0]

p = infile.getPage(0)

outpage = outfile.newPage(-1, width=p.rect.width,

height=p.rect.height)

outpage.showPDFpage(outpage.rect, infile, pno=0)

if rotation == "anticlockwise":

p.rotateClockwise(90)

outpage.setRotation((p.rotation + 90) % 360)

elif rotation == "clockwise":

p.rotateCounterClockwise(90)

outpage.setRotation((p.rotation + 270) % 360)

outfile.addPage(p)

with open(fpath, "wb") as f:

outfile.save(fpath)

outfile.write(f)

instream.close()

def parse(

self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs

"""Extracts tables by calling parser.get_tables on all single

page PDFs.

Parameters

----------

flavor : str (default: 'lattice')

The parsing method to use ('lattice' or 'stream').

Lattice is used by default.

suppress_stdout : str (default: False)

Suppress logs and warnings.

layout_kwargs : dict, optional (default: {})

A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.

kwargs : dict

See camelot.read_pdf kwargs.

Returns

-------

tables : camelot.core.TableList

List of tables found in PDF.

"""

tables = []

with TemporaryDirectory() as tempdir:

for p in self.pages:

try:

self._save_page(self.filepath, p, tempdir)

for p in self.pages:

pages = [

self._save_page(self.filepath, p, tempdir)

os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages

pages = [

]

os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages

parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)

]

for p in pages:

parser = Lattice(

t = parser.extract_tables(

**kwargs) if flavor == "lattice" else Stream(**kwargs)

p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs

for p in pages:

)

t = parser.extract_tables(

tables.extend(t)

p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs

)

tables.extend(t)

except ValueError as err:

if str(err) == "document closed or encrypted":

raise ValueError("file has not been decrypted") from err

raise

return TableList(sorted(tables))

Diferencias guardadas

Texto original

Abrir archivo

# -*- coding: utf-8 -*-

import os
import sys

from PyPDF2 import PdfFileReader, PdfFileWriter

from .core import TableList
from .parsers import Stream, Lattice
from .utils import (
    TemporaryDirectory,
    get_page_layout,
    get_text_objects,
    get_rotation,
    is_url,
    download_url,
)

class PDFHandler(object):
    """Handles all operations like temp directory creation, splitting
    file into single page PDFs, parsing each PDF and then removing the
    temp directory.

    Parameters
    ----------
    filepath : str
        Filepath or URL of the PDF file.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
    password : str, optional (default: None)
        Password for decryption.

    """

    def __init__(self, filepath, pages="1", password=None):
        # A URL is downloaded first; from here on ``filepath`` is whatever
        # ``download_url`` returned.
        if is_url(filepath):
            filepath = download_url(filepath)
        self.filepath = filepath
        if not filepath.lower().endswith(".pdf"):
            raise NotImplementedError("File format not supported")

        if password is None:
            self.password = ""
        else:
            self.password = password
            if sys.version_info[0] < 3:
                self.password = self.password.encode("ascii")
        self.pages = self._get_pages(self.filepath, pages)

    def _get_pages(self, filepath, pages):
        """Converts pages string to list of ints.

        Parameters
        ----------
        filepath : str
            Filepath or URL of the PDF file.
        pages : str, optional (default: '1')
            Comma-separated page numbers.
            Example: '1,3,4' or '1,4-end' or 'all'.

        Returns
        -------
        P : list
            Sorted list of unique int page numbers.

        """
        page_numbers = []
        if pages == "1":
            # The default selection needs no page count, so the file is
            # not opened at all.
            page_numbers.append({"start": 1, "end": 1})
        else:
            # ``with`` closes the handle even when the reader or the int()
            # conversions below raise; the previous open()/close() pair
            # leaked it on any exception.
            with open(filepath, "rb") as instream:
                infile = PdfFileReader(instream, strict=False)
                if infile.isEncrypted:
                    infile.decrypt(self.password)
                if pages == "all":
                    page_numbers.append(
                        {"start": 1, "end": infile.getNumPages()}
                    )
                else:
                    for r in pages.split(","):
                        if "-" in r:
                            a, b = r.split("-")
                            if b == "end":
                                b = infile.getNumPages()
                            page_numbers.append(
                                {"start": int(a), "end": int(b)}
                            )
                        else:
                            page_numbers.append(
                                {"start": int(r), "end": int(r)}
                            )
        P = []
        for p in page_numbers:
            P.extend(range(p["start"], p["end"] + 1))
        return sorted(set(P))

    def _save_page(self, filepath, page, temp):
        """Saves specified page from PDF into a temporary directory.

        The page is written to ``<temp>/page-<page>.pdf``. When
        ``get_rotation`` reports a rotation, that file is renamed to
        ``<temp>/p-<page>_rotated.pdf`` and a counter-rotated copy is
        written back to ``<temp>/page-<page>.pdf``.

        Parameters
        ----------
        filepath : str
            Filepath or URL of the PDF file.
        page : int
            Page number.
        temp : str
            Tmp directory.

        """
        fpath = os.path.join(temp, f"page-{page}.pdf")
        with open(filepath, "rb") as fileobj:
            infile = PdfFileReader(fileobj, strict=False)
            if infile.isEncrypted:
                infile.decrypt(self.password)
            outfile = PdfFileWriter()
            outfile.addPage(infile.getPage(page - 1))
            # Written while the source handle is still open, keeping the
            # original ordering (the writer may still read from it).
            with open(fpath, "wb") as f:
                outfile.write(f)

        layout, _ = get_page_layout(fpath)
        # fix rotated PDF
        chars = get_text_objects(layout, ltype="char")
        horizontal_text = get_text_objects(layout, ltype="horizontal_text")
        vertical_text = get_text_objects(layout, ltype="vertical_text")
        rotation = get_rotation(chars, horizontal_text, vertical_text)
        if rotation == "":
            return

        # Build the backup name from the file name only. The previous
        # ``froot.replace("page", "p")`` also rewrote any "page" substring
        # in the temp directory path, pointing the rename at a directory
        # that does not exist.
        fpath_new = os.path.join(temp, f"p-{page}_rotated.pdf")
        os.rename(fpath, fpath_new)
        # Context manager instead of open()/close() so the handle is
        # released even if reading, rotating or writing raises.
        with open(fpath_new, "rb") as instream:
            infile = PdfFileReader(instream, strict=False)
            if infile.isEncrypted:
                infile.decrypt(self.password)
            outfile = PdfFileWriter()
            p = infile.getPage(0)
            # Undo the detected rotation by turning the page the other way.
            if rotation == "anticlockwise":
                p.rotateClockwise(90)
            elif rotation == "clockwise":
                p.rotateCounterClockwise(90)
            outfile.addPage(p)
            with open(fpath, "wb") as f:
                outfile.write(f)

    def parse(
        self, flavor="lattice", suppress_stdout=False, layout_kwargs=None, **kwargs
    ):
        """Extracts tables by calling parser.extract_tables on all single
        page PDFs.

        Parameters
        ----------
        flavor : str (default: 'lattice')
            The parsing method to use ('lattice' or 'stream').
            Lattice is used by default.
        suppress_stdout : bool (default: False)
            Suppress logs and warnings.
        layout_kwargs : dict, optional (default: None)
            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
            ``None`` is treated as an empty dict.
        kwargs : dict
            See camelot.read_pdf kwargs.

        Returns
        -------
        tables : camelot.core.TableList
            List of tables found in PDF.

        """
        # A fresh dict per call: a ``{}`` default would be one object
        # shared by every call that omits the argument.
        if layout_kwargs is None:
            layout_kwargs = {}
        tables = []
        with TemporaryDirectory() as tempdir:
            for p in self.pages:
                self._save_page(self.filepath, p, tempdir)
            page_paths = [
                os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages
            ]
            # Any flavor other than "lattice" falls back to Stream.
            parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
            for page_path in page_paths:
                t = parser.extract_tables(
                    page_path,
                    suppress_stdout=suppress_stdout,
                    layout_kwargs=layout_kwargs,
                )
                tables.extend(t)
        return TableList(sorted(tables))

Texto modificado

Abrir archivo

# -*- coding: utf-8 -*-

import os
import sys

import fitz

def _get_pages(self, filepath, pages):
        """Converts pages string to list of ints.
        Parameters
        ----------
        filepath : str
            Filepath or URL of the PDF file.
        pages : str, optional (default: '1')
            Comma-separated page numbers.
            Example: '1,3,4' or '1,4-end' or 'all'.
        Returns
        -------
        P : list
            List of int page numbers.
        """
        page_numbers = []
        if pages == "1":
            page_numbers.append({"start": 1, "end": 1})
        else:
            with fitz.open(filepath) as infile:
                if infile.needsPass:
                    infile.authenticate(self.password)
                if pages == "all":
                    page_numbers.append({"start": 1, "end": infile.pageCount})
                else:
                    for r in pages.split(","):
                        if "-" in r:
                            a, b = r.split("-")
                            if b == "end":
                                b = infile.pageCount
                            page_numbers.append(
                                {"start": int(a), "end": int(b)})
                        else:
                            page_numbers.append(
                                {"start": int(r), "end": int(r)})
        P = []
        for p in page_numbers:
            P.extend(range(p["start"], p["end"] + 1))
        return sorted(set(P))

def _save_page(self, filepath, page, temp):
        """Saves specified page from PDF into a temporary directory.

        The page is written to ``<temp>/page-<page>.pdf``. When
        ``get_rotation`` reports a rotation, that file is renamed to
        ``<temp>/p-<page>_rotated.pdf`` and a copy with an adjusted
        rotation is written back to ``<temp>/page-<page>.pdf``.

        Parameters
        ----------
        filepath : str
            Filepath or URL of the PDF file.
        page : int
            Page number.
        temp : str
            Tmp directory.

        """
        # NOTE(review): needsPass / newPage / showPDFpage / setRotation are
        # camelCase PyMuPDF names; confirm they match the pinned version.
        fpath = os.path.join(temp, f"page-{page}.pdf")
        with fitz.open(filepath) as infile:
            if infile.needsPass:
                infile.authenticate(self.password)
            p = infile[page - 1]
            p.setRotation(0)
            # The output document is now closed deterministically; it was
            # previously never closed.
            with fitz.open() as outfile:
                outpage = outfile.newPage(
                    -1, width=p.rect.width, height=p.rect.height
                )
                outpage.showPDFpage(outpage.rect, infile, pno=page - 1)
                outfile.save(fpath)

        layout, _ = get_page_layout(fpath)
        # fix rotated PDF
        chars = get_text_objects(layout, ltype="char")
        horizontal_text = get_text_objects(layout, ltype="horizontal_text")
        vertical_text = get_text_objects(layout, ltype="vertical_text")
        rotation = get_rotation(chars, horizontal_text, vertical_text)
        if rotation == "":
            return

        # Build the backup name from the file name only. The previous
        # ``froot.replace("page", "p")`` also rewrote any "page" substring
        # in the temp directory path, pointing the rename at a directory
        # that does not exist.
        fpath_new = os.path.join(temp, f"p-{page}_rotated.pdf")
        os.rename(fpath, fpath_new)
        # Both documents are closed on exit, including on error; the
        # reopened input was previously never closed.
        with fitz.open(fpath_new) as infile, fitz.open() as outfile:
            if infile.needsPass:
                infile.authenticate(self.password)
            p = infile[0]
            outpage = outfile.newPage(
                -1, width=p.rect.width, height=p.rect.height
            )
            outpage.showPDFpage(outpage.rect, infile, pno=0)
            # Rotation is expressed relative to the source page's own value.
            if rotation == "anticlockwise":
                outpage.setRotation((p.rotation + 90) % 360)
            elif rotation == "clockwise":
                outpage.setRotation((p.rotation + 270) % 360)
            outfile.save(fpath)

def parse(
        self, flavor="lattice", suppress_stdout=False, layout_kwargs=None, **kwargs
    ):
        """Extracts tables by calling parser.extract_tables on all single
        page PDFs.

        Parameters
        ----------
        flavor : str (default: 'lattice')
            The parsing method to use ('lattice' or 'stream').
            Lattice is used by default.
        suppress_stdout : bool (default: False)
            Suppress logs and warnings.
        layout_kwargs : dict, optional (default: None)
            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
            ``None`` is treated as an empty dict.
        kwargs : dict
            See camelot.read_pdf kwargs.

        Returns
        -------
        tables : camelot.core.TableList
            List of tables found in PDF.

        Raises
        ------
        ValueError
            With the message "file has not been decrypted" when a
            ValueError whose message is "document closed or encrypted" is
            raised while saving or parsing pages; any other ValueError is
            re-raised unchanged.

        """
        # A fresh dict per call: a ``{}`` default would be one object
        # shared by every call that omits the argument.
        if layout_kwargs is None:
            layout_kwargs = {}
        tables = []
        with TemporaryDirectory() as tempdir:
            try:
                for page_number in self.pages:
                    self._save_page(self.filepath, page_number, tempdir)
                page_paths = [
                    os.path.join(tempdir, f"page-{page_number}.pdf")
                    for page_number in self.pages
                ]
                # Any flavor other than "lattice" falls back to Stream.
                parser = Lattice(
                    **kwargs) if flavor == "lattice" else Stream(**kwargs)
                for page_path in page_paths:
                    t = parser.extract_tables(
                        page_path,
                        suppress_stdout=suppress_stdout,
                        layout_kwargs=layout_kwargs,
                    )
                    tables.extend(t)
            except ValueError as err:
                # Only this exact message is translated; the original
                # error stays attached as the cause.
                if str(err) == "document closed or encrypted":
                    raise ValueError("file has not been decrypted") from err
                raise
        return TableList(sorted(tables))