Comparar texto

Encuentra la diferencia entre dos archivos de texto

Diff en tiempo real

Diff unificado

Contraer líneas

Resaltar cambios

Resaltado de sintaxis

Herramientas

Diffchecker Desktop: The most secure way to run Diffchecker. Get the Diffchecker Desktop app: your diffs never leave your computer! Get Desktop

textractor.py

Created 6 months ago · Diff never expires

Líneas
Total
Eliminado

Palabras
Total
Eliminado

Para continuar usando esta función, actualice a Diffchecker Pro Ver precios

563 líneas

Líneas
Total
Añadido

Palabras
Total
Añadido

Para continuar usando esta función, actualice a Diffchecker Pro Ver precios

542 líneas

"""

:class:`Textractor` is the main class associated with this package. It needs to be instantiated before using any of the functionalities

the package provides. The main use of this class is to make calls to the Textract API and create Python objects for all the

document entities that are returned in the JSON output of the API. The response received is implicitly parsed and a :class:`Document` type

object is returned containing all the document entities, their associated relationships and metadata.

The Textract API and Textractor method mapping is as below. Use these wrappers to make calls and parse the responses

in one step.

* (SYNC) DetectDocumentText : detect_document_text

* (SYNC) AnalyzeDocument : analyze_document

* (SYNC) AnalyzeID : analyze_id

* (SYNC) AnalyzeExpense : analyze_expense

* (ASYNC) StartDocumentTextDetection : start_document_text_detection

* (ASYNC) StartDocumentAnalysis : start_document_analysis

* (ASYNC) StartExpenseAnalysis : start_expense_analysis

"""

import io

import os

import boto3

import logging

import uuid

from PIL import Image

from copy import deepcopy

from typing import List, Union

from textractcaller import (

call_textract,

call_textract_analyzeid,

call_textract_expense,

OutputConfig,

Query,

QueriesConfig,

)

from textractcaller.t_call import Textract_Call_Mode, Textract_API, get_full_json

try:

from pdf2image import convert_from_bytes, convert_from_path

IS_PDF2IMAGE_INSTALLED = True

except ImportError:

IS_PDF2IMAGE_INSTALLED = False

logging.info("pdf2image is not installed, client-side PDF rasterizing is disabled")

from textractor.data.constants import (

TextractAPI,

TextractFeatures,

)

from textractor.entities.document import Document

from textractor.entities.lazy_document import LazyDocument

from textractor.parsers import response_parser

from textractor.utils.s3_utils import upload_to_s3, s3_path_to_bucket_and_prefix

from textractor.utils.pdf_utils import rasterize_pdf

from textractor.exceptions import (

InputError,

RegionMismatchError,

IncorrectMethodException,

MissingDependencyException,

UnhandledCaseException,

UnsupportedDocumentException,

InvalidS3ObjectException,

)

logger = logging.getLogger(__name__)

try:

import pypdfium2

except:

import pdf2image

IS_PDF_RENDERING_ENABLED = True

except ImportError:

IS_PDF_RENDERING_ENABLED = False

logger.info("pypdfium2 and pdf2image are both not installed, client-side PDF rasterizing is disabled")

class Textractor:

"""

Initializes the customer credentials needed to make calls to Textract using boto3 package internally.

:param profile_name: Customer's profile name as set in the ~/.aws/config file. This profile typically contains this format.

:code:`[default]

region = us-west-2

output=json`

:type profile_name: str

:param region_name: If AWSCLI isn't setup, the user can pass region to let boto3 pick up credentials from the system.

:param region_name: str

:type profile_name: str, optional

:param kms_key_id: Customer's AWS KMS key (cryptographic key)

:type kms_key_id: str, optional

"""

def __init__(

self,

profile_name: str = None,

region_name: str = None,

kms_key_id: str = "",

self.profile_name = profile_name

self.region_name = region_name

self.kms_key_id = kms_key_id

if self.profile_name is not None:

self.session = boto3.session.Session(profile_name=self.profile_name)

elif self.region_name is not None:

self.session = boto3.session.Session(region_name=self.region_name)

elif os.environ.get("AWS_REGION") or os.environ.get("AWS_DEFAULT_REGION"):

# We support both AWS_REGION and AWS_DEFAULT_REGION, with AWS_REGION having precedence.

self.region_name = os.environ.get("AWS_REGION") or os.environ.get("AWS_DEFAULT_REGION")

self.session = boto3.session.Session(region_name=self.region_name)

else:

raise InputError(

"Unable to initiate Textractor. Either profile_name or region requires an input parameter."

)

if self.region_name is not None:

self.textract_client = self.session.client("textract", region_name=self.region_name)

self.textract_client = self.session.client(

"textract", region_name=self.region_name

)

else:

self.textract_client = self.session.client("textract")

self.s3_client = self.session.client("s3")

def _get_document_images_from_path(self, filepath: str) -> List[Image.Image]:

"""

Converts the every page in the document to an image. It supports pdfs and image formats that can be opened by

PIL package. Documents can be stored in the local computer or on an S3 Bucket.

:param filepath: filepath to the document stored locally or on an S3 bucket.

:type filepath: str, required

:return: Returns a list of PIL Images, one for each page of the document

:rtype: List[Image]

"""

images = []

if "s3://" in filepath:

edit_filepath = filepath.replace("s3://", "")

bucket = edit_filepath.split("/")[0]

key = edit_filepath[edit_filepath.index("/") + 1 :]

s3_client = (

boto3.session.Session(profile_name=self.profile_name).client("s3")

if self.profile_name is not None

else boto3.session.Session(region_name=self.region_name).client("s3")

)

file_obj = s3_client.get_object(Bucket=bucket, Key=key).get("Body").read()

if filepath.lower().endswith(".pdf"):

if IS_PDF2IMAGE_INSTALLED:

if IS_PDF_RENDERING_ENABLED:

images = convert_from_bytes(bytearray(file_obj))

images = rasterize_pdf(file_obj)

else:

raise MissingDependencyException(

"pdf2image is not installed. If you do not plan on using visualizations you can skip image generation using save_image=False in your function call."

)

else:

images = [Image.open(io.BytesIO(bytearray(file_obj)))]

else:

if filepath.lower().endswith(".pdf"):

if IS_PDF2IMAGE_INSTALLED:

if IS_PDF_RENDERING_ENABLED:

images = convert_from_path(filepath)

images = rasterize_pdf(filepath)

else:

raise MissingDependencyException(

"pdf2image is not installed. If you do not plan on using visualizations you can skip image generation using save_image=False in your function call."

)

else:

images = [Image.open(filepath)]

if not images:

raise UnhandledCaseException(f"Could not get any images from {filepath}")

return images

def detect_document_text(

self, file_source, s3_output_path: str = "", save_image: bool = True

self, file_source, save_image: bool = True

) -> Document:

"""

Make a call to the SYNC DetectDocumentText API, implicitly parses the response and produces a :class:`Document` object.

This function is ideal for single page PDFs or single images.

:param file_source: Path to a file stored locally, on an S3 bucket or PIL Image

:type file_source: str or PIL.Image, required

:param s3_output_path: S3 path to store the output.

:type s3_output_path: str, optional

:param save_image: Flag to indicate if document images are to be stored within the Document object. This is optional

and necessary only if the customer wants to visualize bounding boxes for their document entities.

:type save_image: bool

:return: Returns a Document object containing all the entities, relationships and metadata extracted by the Textract

DetectDocumentText API stored within it.

:rtype: Document

"""

if isinstance(file_source, list) and len(file_source) > 1:

raise IncorrectMethodException(

"List contains more than 1 image. Call start_document_text_detection instead."

)

elif isinstance(file_source, str):

logging.debug("Filepath given.")

logger.debug("Filepath given.")

images = self._get_document_images_from_path(file_source)

if not save_image and file_source.lower().endswith(".pdf"):

if len(images) > 1:

images = []

raise IncorrectMethodException(

else:

"Input contains more than 1 page. Call start_document_text_detection instead."

images = self._get_document_images_from_path(file_source)

)

if len(images) > 1:

file_source = _image_to_byte_array(images[0])

raise IncorrectMethodException(

"Input contains more than 1 page. Call start_document_analysis() instead."

)

file_source = _image_to_byte_array(images[0])

elif isinstance(file_source, Image.Image):

logging.debug("PIL Image given.")

logger.debug("PIL Image given.")

images = [file_source]

file_source = _image_to_byte_array(file_source)

elif isinstance(file_source, list) and isinstance(file_source[0], Image.Image):

logging.debug("List of PIL Image given.")

logger.debug("List of PIL Image given.")

images = deepcopy(file_source)

file_source = _image_to_byte_array(images[0])

else:

images = []

raise InputError("Input file_source format not supported.")

if not s3_output_path:

output_config = None

else:

bucket, prefix = s3_path_to_bucket_and_prefix(s3_output_path)

output_config = OutputConfig(s3_bucket=bucket, s3_prefix=prefix)

try:

response = call_textract(

input_document=file_source,

features=[],

queries_config=None, # not supported yet

output_config=output_config,

output_config=None,

kms_key_id=self.kms_key_id,

job_tag="",

notification_channel=None, # not supported yet

client_request_token="",

return_job_id=False,

force_async_api=False,

call_mode=Textract_Call_Mode.FORCE_SYNC,

boto3_textract_client=self.textract_client,

job_done_polling_interval=0,

)

except Exception as exception:

if exception.__class__.__name__ == "InvalidS3ObjectException":

raise RegionMismatchError(

raise InvalidS3ObjectException(

"Region passed in the profile_name and S3 bucket do not match. Ensure the regions are the same."

"Textract returned InvalidS3ObjectException. Ensure that the s3 path is correct and that both the Textract API and the bucket are in the same region."

)

elif exception.__class__.__name__ == "UnsupportedDocumentException":

raise UnsupportedDocumentException(

"Textract returned UnsupportedDocumentException, if file_source is a PDF, make sure that it only has one page or use start_document_text_detection. If your file_source is an image, make sure that it is not larger than 5MB."

)

raise exception

document = response_parser.parse(response)

document.response = response

if save_image:

for page in document.pages:

page.image = images[document.pages.index(page)]

return document

def start_document_text_detection(

self,

file_source: Union[str, bytes, Image.Image],

s3_output_path: str = "",

s3_upload_path: str = "",

client_request_token: str = "",

job_tag: str = "",

save_image: bool = True,

) -> LazyDocument:

"""

Make a call to the ASYNC StartDocumentTextDetection API.

:param file_source: File bytes, path to a file stored locally or in an S3 bucket

:type file_source: Union[str, bytes, Image.Image], required

:param s3_output_path: Prefix to store the output on the S3 bucket (passed as param to Textractor).

:type s3_output_path: str

:param s3_upload_path: If given, will automatically upload the document to the given S3 prefix before calling Textract. Files are uploaded

under a uuid. If not given the data is expected to be already in s3

:type s3_upload_path: str, optional

:param client_request_token: The idempotent token that's used to identify the start request. If you use the same. token

with multiple StartDocumentTextDetection requests, the same. JobId is returned. Use ClientRequestToken

to prevent the same. job from being accidentally started more than once.

:type client_request_token: str, optional

:param job_tag: An identifier that you specify that's included in the completion notification published to the Amazon SNS topic.

:type job_tag: str, optional

:param save_image: Flag to indicate if document images are to be stored within the Document object. This is optional

and necessary only if the customer wants to visualize bounding boxes for their document entities.

:type save_image: bool

:return: Returns a job id which can be used to fetch the results

:return: Lazy-loaded Document object

:rtype: str

:rtype: LazyDocument

"""

original_file_source = file_source

if not isinstance(file_source, (str, bytes, Image.Image)):

raise InputError(

f"file_source needs to be of type str, bytes or PIL Image, not {type(file_source)}"

)

# If the file is not already in S3

if not isinstance(file_source, str) or not file_source.startswith("s3://"):

# Check if the user has given us a bucket to upload to

if not s3_upload_path:

raise InputError(

"For files not in S3, an S3 upload path must be provided"

)

s3_file_path = os.path.join(s3_upload_path, str(uuid.uuid4()))

upload_to_s3(self.s3_client, s3_file_path, file_source)

file_source = s3_file_path

output_config = None

if s3_output_path:

s3_bucket, s3_prefix = s3_path_to_bucket_and_prefix(s3_output_path)

output_config = OutputConfig(s3_bucket=s3_bucket, s3_prefix=s3_prefix)

try:

response = call_textract(

input_document=file_source,

features=[],

queries_config=None, # not supported yet

output_config=output_config,

kms_key_id=self.kms_key_id,

job_tag=job_tag,

notification_channel=None, # not supported yet

client_request_token=client_request_token,

return_job_id=True,

force_async_api=True,

call_mode=Textract_Call_Mode.FORCE_ASYNC,

boto3_textract_client=self.textract_client,

job_done_polling_interval=1,

)

except Exception as exception:

if exception.__class__.__name__ == "InvalidS3ObjectException":

raise RegionMismatchError(

raise InvalidS3ObjectException(

"Region passed in the profile_name and S3 bucket do not match. Ensure the regions are the same."

"Textract returned InvalidS3ObjectException. Ensure that the s3 path is correct and that both the Textract API and the bucket are in the same region."

)

raise exception

images = None

if save_image:

if isinstance(original_file_source, Image.Image):

images = [original_file_source]

elif (

isinstance(original_file_source, list)

and len(original_file_source)

and isinstance(original_file_source[0], Image.Image)

images = original_file_source

else:

images = self._get_document_images_from_path(original_file_source)

return LazyDocument(

response["JobId"],

TextractAPI.DETECT_TEXT,

textract_client=self.textract_client,

images=images,

)

def analyze_document(

self,

file_source,

features,

queries: Union[QueriesConfig, List[Query], List[str]] = None,

s3_output_path: str = "",

save_image: bool = True,

) -> Document:

"""

Make a call to the SYNC AnalyzeDocument API, implicitly parses the response and produces a :class:`Document` object.

This function is ideal for single page PDFs or single images.

:param file_source: Path to a file stored locally, on an S3 bucket or PIL Image

:type file_source: str or PIL.Image, required

:param features: List of TextractFeatures to be extracted from the Document by the TextractAPI

:type features: list, required

:param queries: Queries to run on the document

:type features: Union[QueriesConfig, List[Query], List[str]]

:type queries: Union[QueriesConfig, List[Query], List[str]]

:param s3_output_path: Prefix to store the output on the S3 bucket (passed as param to Textractor).

:type s3_output_path: str, optional

:param save_image: Flag to indicate if document images are to be stored within the Document object. This is optional

and necessary only if the customer wants to visualize bounding boxes for their document entities.

:type save_image: bool

:return: Returns a Document object containing all the entities, relationships and metadata extracted by the Textract

AnalyzeDocument API stored within it.

:rtype: Document

"""

if isinstance(file_source, list) and len(file_source) > 1:

raise IncorrectMethodException(

"List contains more than 1 image. Call start_document_analysis() instead."

)

elif isinstance(file_source, str):

logging.debug("Filepath given.")

logger.debug("Filepath given.")

images = self._get_document_images_from_path(file_source)

if not save_image and file_source.lower().endswith(".pdf"):

if len(images) > 1:

images = []

raise IncorrectMethodException(

else:

"Input contains more than 1 page. Call start_document_analysis() instead."

images = self._get_document_images_from_path(file_source)

)

if len(images) > 1:

file_source = _image_to_byte_array(images[0])

raise IncorrectMethodException(

"Input contains more than 1 page. Call start_document_analysis() instead."

)

file_source = _image_to_byte_array(images[0])

elif isinstance(file_source, Image.Image):

logging.debug("PIL Image given.")

logger.debug("PIL Image given.")

images = [file_source]

file_source = _image_to_byte_array(file_source)

elif isinstance(file_source, list) and isinstance(file_source[0], Image.Image):

logging.debug("List of PIL Image given.")

logger.debug("List of PIL Image given.")

images = deepcopy(file_source)

file_source = _image_to_byte_array(images[0])

else:

images = []

raise InputError("Input file_source format not supported.")

if not s3_output_path:

output_config = None

else:

bucket, prefix = s3_path_to_bucket_and_prefix(s3_output_path)

output_config = OutputConfig(s3_bucket=bucket, s3_prefix=prefix)

if not isinstance(features, list):

features = [features]

if queries and TextractFeatures.QUERIES not in features:

raise InputError(

"Queries were given as a parameter but QUERIES is not enabled in the feature set"

)

if queries and not isinstance(queries, QueriesConfig):

if not isinstance(queries, List):

raise InputError(

f"Queries must be of type QueriesConfig, List[Query] or List[str], not {type(queries)}"

)

if isinstance(queries[0], Query):

queries_config = QueriesConfig(queries)

queries = queries_config

elif isinstance(queries[0], str):

queries_config = QueriesConfig([Query(query) for query in queries])

queries = queries_config

else:

raise InputError(

f"Queries must be of type QueriesConfig, List[Query] or List[str], not {type(queries)}"

)

try:

response = call_textract(

input_document=file_source,

features=features,

queries_config=queries, # not supported yet

output_config=output_config,

output_config=None,

kms_key_id=self.kms_key_id,

job_tag="",

notification_channel=None, # not supported yet

client_request_token="",

return_job_id=False,

force_async_api=False,

call_mode=Textract_Call_Mode.FORCE_SYNC,

boto3_textract_client=self.textract_client,

job_done_polling_interval=0,

)

except Exception as exception:

if exception.__class__.__name__ == "InvalidS3ObjectException":

raise RegionMismatchError(

raise InvalidS3ObjectException(

"Region passed in the profile_name and S3 bucket do not match. Ensure the regions are the same."

"Textract returned InvalidS3ObjectException. Ensure that the s3 path is correct and that both the Textract API and the bucket are in the same region."

)

elif exception.__class__.__name__ == "UnsupportedDocumentException":

raise UnsupportedDocumentException(

"Textract returned an UnsupportedDocumentException, if file_source is a PDF, make sure that it only has one page or use start_document_analysis. If your file_source is an image, make sure that it is not larger than 5MB."

)

raise exception

document = response_parser.parse(response)

document.response = response

if save_image:

for page in document.pages:

page.image = images[document.pages.index(page)]

return document

def start_document_analysis(

self,

file_source: Union[str, bytes, Image.Image],

features,

s3_output_path: str = "",

s3_upload_path: str = "",

queries: Union[QueriesConfig, List[Query], List[str]] = None,

client_request_token: str = "",

job_tag: str = "",

save_image: bool = True,

) -> LazyDocument:

"""

Make a call to the ASYNC StartDocumentAnalysis API, implicitly parses the response and produces a :class:`Document` object.

This function is ideal for multipage PDFs or an image.

:param file_source: Path to a file stored locally, on an S3 bucket or a PIL Image

:type file_source: Union[str, bytes, Image.Image], required

:param features: List of TextractFeatures to be extracted from the Document by the TextractAPI

:type features: list, required

:param s3_output_path: Path to store the output on the S3 bucket (passed as param to Textractor).

:type s3_output_path: str

:param s3_upload_path: If given, will automatically upload the document to the given S3 prefix before calling Textract. Files are uploaded

under a uuid. If not given the data is expected to be already in s3

:type s3_upload_path: str, optional

:param client_request_token: The idempotent token that's used to identify the start request. If you use the same. token

with multiple StartDocumentTextDetection requests, the same. JobId is returned. Use ClientRequestToken

to prevent the same. job from being accidentally started more than once.

:type client_request_token: str, optional

:param job_tag: An identifier that you specify that's included in the completion notification published to the Amazon SNS topic.

:type job_tag: str, optional

:param save_image: Flag to indicate if document images are to be stored within the Document object. This is optional

and necessary only if the customer wants to visualize bounding boxes for their document entities.

:type save_image: bool

:return: Returns a Document object containing all the entities, relationships and metadata extracted by the Textract

StartDocumentAnalysis API stored within it.

:rtype: Document

"""

original_file_source = file_source

if not isinstance(file_source, (str, bytes, Image.Image)):

raise InputError(

f"file_source needs to be of type str, bytes or PIL Image, not {type(file_source)}"

)

# If the file is not already in S3

if not isinstance(file_source, str) or not file_source.startswith("s3://"):

# Check if the user has given us a bucket to upload to

if not s3_upload_path:

raise InputError(

f"For files not in S3, an S3 upload path must be provided"

)

s3_file_path = os.path.join(s3_upload_path, str(uuid.uuid4()))

upload_to_s3(self.s3_client, s3_file_path, file_source)

file_source = s3_file_path

output_config = None

if s3_output_path:

s3_bucket, s3_prefix = s3_path_to_bucket_and_prefix(s3_output_path)

output_config = OutputConfig(s3_bucket=s3_bucket, s3_prefix=s3_prefix)

if not isinstance(features, list):

features = [features]

if queries and TextractFeatures.QUERIES not in features:

raise InputError(

"Queries were given as a parameter but QUERIES is not enabled in the feature set"

)

if queries and not isinstance(queries, QueriesConfig):

if not isinstance(queries, List):

raise InputError(

f"Queries must be of type QueriesConfig, List[Query] or List[str], not {type(queries)}"

)

if isinstance(queries[0], Query):

queries_config = QueriesConfig(queries)

queries = queries_config

elif isinstance(queries[0], str):

queries_config = QueriesConfig([Query(query) for query in queries])

queries = queries_config

else:

raise InputError(

f"Queries must be of type QueriesConfig, List[Query] or List[str], not {type(queries)}"

f"Queries must be of type QueriesC

)

try:

response = call_textract(

input_document=file_source,

features=features,

queries_config=queries, # not supported yet

output_config=output_config,

kms_key_id=self.kms_key_id,

job_tag=job_tag,

notification_channel=None, # not supported yet

client_request_token=client_request_token,

return_job_id=True,

force_async_api=True,

call_mode=Textract_Call_Mode.FORCE_ASYNC,

boto3_textract_client=self.textract_client,

job_done_polling_interval=1,

)

except Exception as exception:

if exception.__class__.__name__ == "InvalidS3ObjectException":

raise RegionMismatchError(

"Region passed in the profile_name and S3 bucket do not match. Ensure the regions are the same."

)

raise exception

images = None

if save_image:

if isinstance(original_file_source, Image.Image):

images = [original_file_source]

elif (

Diferencias guardadas

Texto original

Abrir archivo

"""
:class:`Textractor` is the main class associated with this package. It needs to be instantiated before using any of the functionalities
the package provides. The main use of this class is to make calls to the Textract API and create Python objects for all the
document entities that are returned in the JSON output of the API. The response received is implicitly parsed and a :class:`Document` type 
object is returned containing all the document entities, their associated relationships and metadata.

The Textract API and Textractor method mapping is as below. Use these wrappers to make calls and parse the responses
in one step.

* (SYNC) DetectDocumentText : detect_document_text
* (SYNC) AnalyzeDocument : analyze_document
* (SYNC) AnalyzeID : analyze_id
* (SYNC) AnalyzeExpense : analyze_expense
* (ASYNC) StartDocumentTextDetection : start_document_text_detection
* (ASYNC) StartDocumentAnalysis : start_document_analysis
* (ASYNC) StartExpenseAnalysis : start_expense_analysis

"""

try:
    from pdf2image import convert_from_bytes, convert_from_path

IS_PDF2IMAGE_INSTALLED = True
except ImportError:
    IS_PDF2IMAGE_INSTALLED = False
    logging.info("pdf2image is not installed, client-side PDF rasterizing is disabled")

from textractor.data.constants import (
    TextractAPI,
    TextractFeatures,
)
from textractor.entities.document import Document
from textractor.entities.lazy_document import LazyDocument
from textractor.parsers import response_parser
from textractor.utils.s3_utils import upload_to_s3, s3_path_to_bucket_and_prefix
from textractor.exceptions import (
    InputError,
    RegionMismatchError,
    IncorrectMethodException,
    MissingDependencyException,
    UnhandledCaseException,
)

class Textractor:
    """
    Initializes the customer credentials needed to make calls to Textract using boto3 package internally.

    :param profile_name: Customer's profile name as set in the ~/.aws/config file. This profile typically contains this format.
                                :code:`[default]
                                region = us-west-2
                                output=json`
    :type profile_name: str, optional
    :param region_name: If AWSCLI isn't setup, the user can pass region to let boto3 pick up credentials from the system.
    :type region_name: str, optional
    :param kms_key_id: Customer's AWS KMS key (cryptographic key)
    :type kms_key_id: str, optional
    """

def __init__(
    self,
    profile_name: str = None,
    region_name: str = None,
    kms_key_id: str = "",
):
    """
    Create the boto3 session plus the Textract and S3 clients used by this instance.

    The session is built from ``profile_name`` when it is given, otherwise from
    ``region_name``. When neither is given, the region is read from the
    ``AWS_REGION`` environment variable, falling back to ``AWS_DEFAULT_REGION``.
    If ``profile_name`` and ``region_name`` are both given, the session comes from
    the profile and ``region_name`` is passed to the Textract client.

    :param profile_name: Customer's profile name as set in the ~/.aws/config file.
    :type profile_name: str, optional
    :param region_name: AWS region to use when no profile is given.
    :type region_name: str, optional
    :param kms_key_id: Customer's AWS KMS key (cryptographic key), stored and forwarded to Textract calls.
    :type kms_key_id: str, optional
    :raises InputError: If no profile is given and no region can be determined from
                        the arguments or the environment.
    """
    # Local import keeps this method self-contained regardless of module-level imports.
    import os

    self.profile_name = profile_name
    self.region_name = region_name
    self.kms_key_id = kms_key_id

    if self.profile_name is not None:
        self.session = boto3.session.Session(profile_name=self.profile_name)
    elif self.region_name is not None:
        self.session = boto3.session.Session(region_name=self.region_name)
    else:
        # AWS_REGION takes precedence over AWS_DEFAULT_REGION when both are set.
        env_region = os.environ.get("AWS_REGION") or os.environ.get("AWS_DEFAULT_REGION")
        if not env_region:
            raise InputError(
                "Unable to initiate Textractor. Either profile_name or region requires an input parameter."
            )
        self.region_name = env_region
        self.session = boto3.session.Session(region_name=self.region_name)

    if self.region_name is not None:
        self.textract_client = self.session.client("textract", region_name=self.region_name)
    else:
        # Profile-only configuration: let the session decide the region.
        self.textract_client = self.session.client("textract")
    self.s3_client = self.session.client("s3")

def _get_document_images_from_path(self, filepath: str) -> List[Image.Image]:
    """
    Load a document and return one PIL image per page.

    PDFs are rasterized with pdf2image; any other file is opened with PIL as a
    single image. The document may be a local path or an ``s3://bucket/key`` URI;
    S3 objects are downloaded with the instance's S3 client.

    :param filepath: filepath to the document stored locally or on an S3 bucket.
    :type filepath: str, required
    :return: Returns a list of PIL Images, one for each page of the document
    :rtype: List[Image]
    :raises MissingDependencyException: If the document is a PDF and pdf2image is not installed.
    :raises InputError: If an S3 URI does not contain both a bucket and a key.
    :raises UnhandledCaseException: If no image could be produced from the document.
    """
    is_pdf = filepath.lower().endswith(".pdf")
    # Check the optional dependency up front so we never download a PDF we cannot render.
    if is_pdf and not IS_PDF2IMAGE_INSTALLED:
        raise MissingDependencyException(
            "pdf2image is not installed. If you do not plan on using visualizations you can skip image generation using save_image=False in your function call."
        )

    s3_scheme = "s3://"
    if filepath.startswith(s3_scheme):
        bucket, _, key = filepath[len(s3_scheme):].partition("/")
        if not bucket or not key:
            raise InputError(
                f"S3 path must be of the form s3://bucket/key, got {filepath}"
            )
        # Reuse the client created in __init__ rather than opening a new session per call.
        file_obj = self.s3_client.get_object(Bucket=bucket, Key=key)["Body"].read()
        if is_pdf:
            images = convert_from_bytes(file_obj)
        else:
            images = [Image.open(io.BytesIO(file_obj))]
    elif is_pdf:
        images = convert_from_path(filepath)
    else:
        images = [Image.open(filepath)]

    if not images:
        raise UnhandledCaseException(f"Could not get any images from {filepath}")

    return images

def detect_document_text(
    self, file_source, s3_output_path: str = "", save_image: bool = True
) -> Document:
    """
    Make a call to the SYNC DetectDocumentText API, implicitly parses the response and produces a :class:`Document` object.
    This function is ideal for single page PDFs or single images.

    :param file_source: Path to a file stored locally, on an S3 bucket, a PIL Image or a list holding a single PIL Image
    :type file_source: str or PIL.Image, required
    :param s3_output_path: S3 path to store the output.
    :type s3_output_path: str, optional
    :param save_image: Flag to indicate if document images are to be stored within the Document object. This is optional
                        and necessary only if the customer wants to visualize bounding boxes for their document entities.
    :type save_image: bool

    :return: Returns a Document object containing all the entities, relationships and metadata extracted by the Textract
             DetectDocumentText API stored within it.
    :rtype: Document
    :raises IncorrectMethodException: If the input holds more than one image or page.
    :raises InputError: If ``file_source`` is of an unsupported type, including an empty list.
    :raises RegionMismatchError: If Textract reports an InvalidS3ObjectException.
    """
    if isinstance(file_source, list) and len(file_source) > 1:
        raise IncorrectMethodException(
            "List contains more than 1 image. Call start_document_text_detection instead."
        )

    if isinstance(file_source, str):
        logging.debug("Filepath given.")
        # NOTE: the document is loaded as images even when save_image is False,
        # because the page count is needed to reject multi-page input.
        images = self._get_document_images_from_path(file_source)
        if len(images) > 1:
            raise IncorrectMethodException(
                "Input contains more than 1 page. Call start_document_text_detection instead."
            )
        file_source = _image_to_byte_array(images[0])
    elif isinstance(file_source, Image.Image):
        logging.debug("PIL Image given.")
        images = [file_source]
        file_source = _image_to_byte_array(file_source)
    elif (
        isinstance(file_source, list)
        and file_source  # an empty list must fall through to InputError, not IndexError
        and isinstance(file_source[0], Image.Image)
    ):
        logging.debug("List of PIL Image given.")
        images = deepcopy(file_source)
        file_source = _image_to_byte_array(images[0])
    else:
        raise InputError("Input file_source format not supported.")

    output_config = None
    if s3_output_path:
        bucket, prefix = s3_path_to_bucket_and_prefix(s3_output_path)
        output_config = OutputConfig(s3_bucket=bucket, s3_prefix=prefix)

    try:
        response = call_textract(
            input_document=file_source,
            features=[],
            queries_config=None,  # not supported yet
            output_config=output_config,
            kms_key_id=self.kms_key_id,
            job_tag="",
            notification_channel=None,  # not supported yet
            client_request_token="",
            return_job_id=False,
            force_async_api=False,
            call_mode=Textract_Call_Mode.FORCE_SYNC,
            boto3_textract_client=self.textract_client,
            job_done_polling_interval=0,
        )
    except Exception as exception:
        # Matched by class name, as the service exception class is not referenced directly here.
        if exception.__class__.__name__ == "InvalidS3ObjectException":
            raise RegionMismatchError(
                "Region passed in the profile_name and S3 bucket do not match. Ensure the regions are the same."
            ) from exception
        # Bare raise re-raises the active exception with its original traceback.
        raise

    document = response_parser.parse(response)
    document.response = response
    if save_image:
        # Pair pages with images by position; looking pages up by equality could
        # attach the wrong image and costs a scan per page.
        for page_index, page in enumerate(document.pages):
            page.image = images[page_index]
    return document

def start_document_text_detection(
    self,
    file_source: Union[str, bytes, Image.Image],
    s3_output_path: str = "",
    s3_upload_path: str = "",
    client_request_token: str = "",
    job_tag: str = "",
    save_image: bool = True,
) -> LazyDocument:
    """
    Make a call to the ASYNC StartDocumentTextDetection API.

    When ``file_source`` is not an ``s3://`` string it is uploaded under
    ``s3_upload_path`` (object name is a random UUID) before Textract is called.

    :param file_source: File bytes, PIL Image, or path to a file stored locally or in an S3 bucket
    :type file_source: Union[str, bytes, Image.Image], required
    :param s3_output_path: Prefix to store the output on the S3 bucket (passed as param to Textractor).
    :type s3_output_path: str
    :param s3_upload_path: If given, will automatically upload the document to the given S3 prefix before calling Textract.
                           Files are uploaded under a uuid. If not given the data is expected to be already in S3.
    :type s3_upload_path: str, optional
    :param client_request_token: The idempotent token that's used to identify the start request. If you use the same token
                                 with multiple StartDocumentTextDetection requests, the same JobId is returned. Use
                                 ClientRequestToken to prevent the same job from being accidentally started more than once.
    :type client_request_token: str, optional
    :param job_tag: An identifier that you specify that's included in the completion notification published to the Amazon SNS topic.
    :type job_tag: str, optional
    :param save_image: When true, the source images are handed to the returned LazyDocument.
    :type save_image: bool

    :raises InputError: If ``file_source`` has an unsupported type, or is not in S3 and no ``s3_upload_path`` was given.
    :raises RegionMismatchError: If Textract raises ``InvalidS3ObjectException``.
    :return: Lazy-loaded document wrapping the started job's id
    :rtype: LazyDocument
    """
    original_file_source = file_source

    if not isinstance(file_source, (str, bytes, Image.Image)):
        raise InputError(
            f"file_source needs to be of type str, bytes or PIL Image, not {type(file_source)}"
        )

    # If the file is not already in S3 it has to be uploaded first.
    if not isinstance(file_source, str) or not file_source.startswith("s3://"):
        if not s3_upload_path:
            raise InputError(
                "For files not in S3, an S3 upload path must be provided"
            )

        # S3 keys always use "/"; os.path.join would insert "\\" on Windows.
        separator = "" if s3_upload_path.endswith("/") else "/"
        s3_file_path = f"{s3_upload_path}{separator}{uuid.uuid4()}"
        upload_to_s3(self.s3_client, s3_file_path, file_source)
        file_source = s3_file_path

    output_config = None
    if s3_output_path:
        s3_bucket, s3_prefix = s3_path_to_bucket_and_prefix(s3_output_path)
        output_config = OutputConfig(s3_bucket=s3_bucket, s3_prefix=s3_prefix)

    try:
        response = call_textract(
            input_document=file_source,
            features=[],
            queries_config=None,  # not supported yet
            output_config=output_config,
            kms_key_id=self.kms_key_id,
            job_tag=job_tag,
            notification_channel=None,  # not supported yet
            client_request_token=client_request_token,
            return_job_id=True,
            force_async_api=True,
            call_mode=Textract_Call_Mode.FORCE_ASYNC,
            boto3_textract_client=self.textract_client,
            job_done_polling_interval=1,
        )
    except Exception as exception:
        # Matched by class name, as the original code does.
        if exception.__class__.__name__ == "InvalidS3ObjectException":
            raise RegionMismatchError(
                "Region passed in the profile_name and S3 bucket do not match. Ensure the regions are the same."
            ) from exception
        raise

    images = None
    if save_image:
        # The type guard above already rejected lists, so only a single PIL
        # image or a str/bytes source can reach this point.
        if isinstance(original_file_source, Image.Image):
            images = [original_file_source]
        else:
            # NOTE(review): raw bytes are also routed to this path-based helper;
            # confirm it accepts bytes.
            images = self._get_document_images_from_path(original_file_source)

    return LazyDocument(
        response["JobId"],
        TextractAPI.DETECT_TEXT,
        textract_client=self.textract_client,
        images=images,
    )

def analyze_document(
    self,
    file_source,
    features,
    queries: Union[QueriesConfig, List[Query], List[str]] = None,
    s3_output_path: str = "",
    save_image: bool = True,
) -> Document:
    """
    Make a call to the SYNC AnalyzeDocument API, implicitly parses the response and produces a :class:`Document` object.
    This function is ideal for single page PDFs or single images.

    :param file_source: Path to a file stored locally or on an S3 bucket, a PIL Image, or a list holding one PIL Image
    :type file_source: str or PIL.Image, required
    :param features: TextractFeatures to be extracted from the Document by the TextractAPI; a single value is wrapped in a list
    :type features: list, required
    :param queries: Queries to run on the document
    :type queries: Union[QueriesConfig, List[Query], List[str]]
    :param s3_output_path: Prefix to store the output on the S3 bucket (passed as param to Textractor).
    :type s3_output_path: str, optional
    :param save_image: Flag to indicate if document images are to be stored within the Document object. This is optional
                        and necessary only if the customer wants to visualize bounding boxes for their document entities.
    :type save_image: bool

    :raises IncorrectMethodException: If the input holds more than one image or page.
    :raises InputError: If ``file_source`` or ``queries`` has an unsupported type, or queries are given without the QUERIES feature.
    :raises RegionMismatchError: If Textract raises ``InvalidS3ObjectException``.
    :return: Returns a Document object containing all the entities, relationships and metadata extracted by the Textract
             AnalyzeDocument API stored within it.
    :rtype: Document
    """
    if isinstance(file_source, list) and len(file_source) > 1:
        raise IncorrectMethodException(
            "List contains more than 1 image. Call start_document_analysis() instead."
        )
    elif isinstance(file_source, str):
        logging.debug("Filepath given.")
        images = self._get_document_images_from_path(file_source)
        if len(images) > 1:
            raise IncorrectMethodException(
                "Input contains more than 1 page. Call start_document_analysis() instead."
            )
        file_source = _image_to_byte_array(images[0])
    elif isinstance(file_source, Image.Image):
        logging.debug("PIL Image given.")
        images = [file_source]
        file_source = _image_to_byte_array(file_source)
    elif (
        isinstance(file_source, list)
        and file_source
        and isinstance(file_source[0], Image.Image)
    ):
        # Single-image list, handled the same way detect_document_text does.
        logging.debug("List of PIL Image given.")
        images = deepcopy(file_source)
        file_source = _image_to_byte_array(images[0])
    else:
        raise InputError("Input file_source format not supported.")

    if not isinstance(features, list):
        features = [features]

    if queries and TextractFeatures.QUERIES not in features:
        raise InputError(
            "Queries were given as a parameter but QUERIES is not enabled in the feature set"
        )

    # Normalise list inputs into the QueriesConfig object passed to call_textract.
    if queries and not isinstance(queries, QueriesConfig):
        if not isinstance(queries, list):
            raise InputError(
                f"Queries must be of type QueriesConfig, List[Query] or List[str], not {type(queries)}"
            )
        # Only the first element is inspected to decide how to wrap the list.
        if isinstance(queries[0], Query):
            queries = QueriesConfig(queries)
        elif isinstance(queries[0], str):
            queries = QueriesConfig([Query(query) for query in queries])
        else:
            raise InputError(
                f"Queries must be of type QueriesConfig, List[Query] or List[str], not {type(queries)}"
            )

    # Previously `output_config` was referenced without ever being assigned.
    output_config = None
    if s3_output_path:
        bucket, prefix = s3_path_to_bucket_and_prefix(s3_output_path)
        output_config = OutputConfig(s3_bucket=bucket, s3_prefix=prefix)

    try:
        response = call_textract(
            input_document=file_source,
            features=features,
            queries_config=queries,
            output_config=output_config,
            kms_key_id=self.kms_key_id,
            job_tag="",
            notification_channel=None,  # not supported yet
            client_request_token="",
            return_job_id=False,
            force_async_api=False,
            call_mode=Textract_Call_Mode.FORCE_SYNC,
            boto3_textract_client=self.textract_client,
            job_done_polling_interval=0,
        )
    except Exception as exception:
        if exception.__class__.__name__ == "InvalidS3ObjectException":
            raise RegionMismatchError(
                "Region passed in the profile_name and S3 bucket do not match. Ensure the regions are the same."
            ) from exception
        raise

    document = response_parser.parse(response)
    document.response = response
    if save_image:
        # Pair each parsed page with the source image at the same position.
        for page, image in zip(document.pages, images):
            page.image = image
    return document

def start_document_analysis(
    self,
    file_source: Union[str, bytes, Image.Image],
    features,
    s3_output_path: str = "",
    s3_upload_path: str = "",
    queries: Union[QueriesConfig, List[Query], List[str]] = None,
    client_request_token: str = "",
    job_tag: str = "",
    save_image: bool = True,
) -> LazyDocument:
    """
    Make a call to the ASYNC StartDocumentAnalysis API and return a lazily loaded document for the started job.
    This function is ideal for multipage PDFs or an image.

    :param file_source: Path to a file stored locally, on an S3 bucket or a PIL Image
    :type file_source: Union[str, bytes, Image.Image], required
    :param features: TextractFeatures to be extracted from the Document by the TextractAPI; a single value is wrapped in a list
    :type features: list, required
    :param s3_output_path: Path to store the output on the S3 bucket (passed as param to Textractor).
    :type s3_output_path: str
    :param s3_upload_path: If given, will automatically upload the document to the given S3 prefix before calling Textract.
                           Files are uploaded under a uuid. If not given the data is expected to be already in S3.
    :type s3_upload_path: str, optional
    :param queries: Queries to run on the document
    :type queries: Union[QueriesConfig, List[Query], List[str]]
    :param client_request_token: The idempotent token that's used to identify the start request. If you use the same token
                                 with multiple StartDocumentAnalysis requests, the same JobId is returned. Use
                                 ClientRequestToken to prevent the same job from being accidentally started more than once.
    :type client_request_token: str, optional
    :param job_tag: An identifier that you specify that's included in the completion notification published to the Amazon SNS topic.
    :type job_tag: str, optional
    :param save_image: When true, the source images are handed to the returned LazyDocument.
    :type save_image: bool

    :raises InputError: If the file is not in S3 and no ``s3_upload_path`` was given, or if ``queries`` is invalid.
    :raises RegionMismatchError: If Textract raises ``InvalidS3ObjectException``.
    :return: Lazy-loaded document wrapping the started job's id
    :rtype: LazyDocument
    """
    original_file_source = file_source

    # If the file is not already in S3 it has to be uploaded first.
    if not isinstance(file_source, str) or not file_source.startswith("s3://"):
        if not s3_upload_path:
            raise InputError(
                "For files not in S3, an S3 upload path must be provided"
            )

        # S3 keys always use "/"; os.path.join would insert "\\" on Windows.
        separator = "" if s3_upload_path.endswith("/") else "/"
        s3_file_path = f"{s3_upload_path}{separator}{uuid.uuid4()}"
        upload_to_s3(self.s3_client, s3_file_path, file_source)
        file_source = s3_file_path

    if not isinstance(features, list):
        features = [features]

    if queries and TextractFeatures.QUERIES not in features:
        raise InputError(
            "Queries were given as a parameter but QUERIES is not enabled in the feature set"
        )

    # Accept the same query shapes as analyze_document and wrap lists in QueriesConfig.
    if queries and not isinstance(queries, QueriesConfig):
        if not isinstance(queries, list):
            raise InputError(
                f"Queries must be of type QueriesConfig, List[Query] or List[str], not {type(queries)}"
            )
        if isinstance(queries[0], Query):
            queries = QueriesConfig(queries)
        elif isinstance(queries[0], str):
            queries = QueriesConfig([Query(query) for query in queries])
        else:
            raise InputError(
                f"Queries must be of type QueriesConfig, List[Query] or List[str], not {type(queries)}"
            )

    # Previously `output_config` was referenced without ever being assigned.
    output_config = None
    if s3_output_path:
        s3_bucket, s3_prefix = s3_path_to_bucket_and_prefix(s3_output_path)
        output_config = OutputConfig(s3_bucket=s3_bucket, s3_prefix=s3_prefix)

    try:
        response = call_textract(
            input_document=file_source,
            features=features,
            queries_config=queries,
            output_config=output_config,
            kms_key_id=self.kms_key_id,
            job_tag=job_tag,
            notification_channel=None,  # not supported yet
            client_request_token=client_request_token,
            return_job_id=True,
            force_async_api=True,
            call_mode=Textract_Call_Mode.FORCE_ASYNC,
            boto3_textract_client=self.textract_client,
            job_done_polling_interval=1,
        )
    except Exception as exception:
        if exception.__class__.__name__ == "InvalidS3ObjectException":
            raise RegionMismatchError(
                "Region passed in the profile_name and S3 bucket do not match. Ensure the regions are the same."
            ) from exception
        raise

    # Previously `images` was passed to LazyDocument without being assigned.
    images = None
    if save_image:
        if isinstance(original_file_source, Image.Image):
            images = [original_file_source]
        elif (
            isinstance(original_file_source, list)
            and len(original_file_source)
            and isinstance(original_file_source[0], Image.Image)
        ):
            images = original_file_source
        else:
            # NOTE(review): raw bytes are also routed to this path-based helper;
            # confirm it accepts bytes.
            images = self._get_document_images_from_path(original_file_source)

    return LazyDocument(
        response["JobId"],
        TextractAPI.ANALYZE,
        textract_client=self.textract_client,
        images=images,
    )

def analyze_id(
    self,
    file_source: Union[str, List[Image.Image], List[str]],
    save_image: bool = True,
) -> Document:
    """AnalyzeID parses identity documents such as passports and driver's license and
    returns the result as a :class:`Document`. See https://docs.aws.amazon.com/textract/latest/dg/identitydocumentfields.html
    for a complete list of the standardized fields.

    :param file_source: Path to a file stored locally or on an S3 bucket, a PIL Image, or a list of PIL Images
    :type file_source: Union[str, List[Image.Image], List[str]]
    :param save_image: Saves the images in the returned Document object for visualizing the results, defaults to True
    :type save_image: bool, optional
    :raises InputError: Raised when the file_source could not be parsed
    :raises RegionMismatchError: Raised when the S3 object passed as file source is in a region that does not match the one used to create the Textractor object.
    :raises exception: Raised when the Textract call fails
    :return: Document
    :rtype: Document
    """
    if isinstance(file_source, str):
        logging.debug("Filepath given.")
        images = self._get_document_images_from_path(file_source)
    elif isinstance(file_source, Image.Image):
        logging.debug("PIL Image given.")
        images = [file_source]
    elif (
        isinstance(file_source, list)
        and file_source  # an empty list is rejected below instead of raising IndexError
        and isinstance(file_source[0], Image.Image)
    ):
        logging.debug("List of PIL Image given.")
        # FIXME: Is this needed?
        images = deepcopy(file_source)
    else:
        raise InputError("Input file_source format not supported.")

    images_bytes = [_image_to_byte_array(image) for image in images]

    try:
        response = call_textract_analyzeid(
            document_pages=images_bytes,
            boto3_textract_client=self.textract_client,
        )
    except Exception as exception:
        if exception.__class__.__name__ == "InvalidS3ObjectException":
            raise RegionMismatchError(
                "Region passed in the profile_name and S3 bucket do not match. Ensure the regions are the same."
            ) from exception
        raise

    # Previously the response was discarded and the method returned None.
    # NOTE(review): assumes response_parser.parse also handles AnalyzeID
    # responses, as it does for the other APIs in this class — confirm.
    document = response_parser.parse(response)
    document.response = response
    if save_image:
        # Pair each parsed page with the source image at the same position.
        for page, image in zip(document.pages, images):
            page.image = image
    return document

def analyze_expense(
    self,
    file_source: Union[str, List[Image.Image], List[str]],
    s3_output_path: str = "",
    save_image: bool = True,
) -> Document:
    """Make a call to the SYNC AnalyzeExpense API, implicitly parses the response and produces a :class:`Document` object.
    This function only accepts a single page or image; use start_expense_analysis for multipage input.

    :param file_source: Path to a file stored locally or on an S3 bucket, a PIL Image, or a list holding one PIL Image
    :type file_source: Union[str, List[Image.Image], List[str]]
    :param s3_output_path: S3 output path. When used the job output is saved to the given S3 path, defaults to ""
    :type s3_output_path: str, optional
    :param save_image: Whether to keep the file source as PIL Images inside the returned Document object, defaults to True
    :type save_image: bool, optional
    :raises IncorrectMethodException: Raised when the input holds more than one image or page
    :raises InputError: Raised when the file source type is invalid
    :raises RegionMismatchError: Raised when the file source region is different from the API region.
    :raises exception: Raised if the Textract API call fails
    :return: Document
    :rtype: Document
    """
    if isinstance(file_source, list) and len(file_source) > 1:
        raise IncorrectMethodException(
            "List contains more than 1 image. Call start_expense_analysis instead."
        )
    elif isinstance(file_source, str):
        logging.debug("Filepath given.")
        images = self._get_document_images_from_path(file_source)
        if len(images) > 1:
            raise IncorrectMethodException(
                "Input contains more than 1 page. Call start_expense_analysis instead."
            )
        file_source = _image_to_byte_array(images[0])
    elif isinstance(file_source, Image.Image):
        logging.debug("PIL Image given.")
        images = [file_source.copy()]
        file_source = _image_to_byte_array(file_source)
    elif (
        isinstance(file_source, list)
        and file_source
        and isinstance(file_source[0], Image.Image)
    ):
        # Single-image list, handled the same way detect_document_text does.
        logging.debug("List of PIL Image given.")
        images = deepcopy(file_source)
        file_source = _image_to_byte_array(images[0])
    else:
        raise InputError("Input file_source format not supported.")

    # Previously `output_config` was referenced without ever being assigned.
    output_config = None
    if s3_output_path:
        bucket, prefix = s3_path_to_bucket_and_prefix(s3_output_path)
        output_config = OutputConfig(s3_bucket=bucket, s3_prefix=prefix)

    try:
        response = call_textract_expense(
            input_document=file_source,
            output_config=output_config,
            kms_key_id=self.kms_key_id,
            job_tag="",
            notification_channel=None,  # not supported yet
            client_request_token="",
            return_job_id=False,
            force_async_api=False,
            boto3_textract_client=self.textract_client,
            job_done_polling_interval=0,
        )
    except Exception as exception:
        if exception.__class__.__name__ == "InvalidS3ObjectException":
            raise RegionMismatchError(
                "Region passed in the profile_name and S3 bucket do not match. Ensure the regions are the same."
            ) from exception
        raise

    # Previously the response was discarded and the method returned None.
    document = response_parser.parse(response)
    document.response = response
    if save_image:
        # Pair each parsed page with the source image at the same position.
        for page, image in zip(document.pages, images):
            page.image = image
    return document

def start_expense_analysis(
    self,
    file_source: Union[str, bytes, Image.Image],
    s3_output_path: str = "",
    s3_upload_path: str = "",
    client_request_token: str = "",
    job_tag: str = "",
    save_image: bool = True,
) -> LazyDocument:
    """Make a call to the ASYNC StartExpenseAnalysis API and return a lazily loaded document for the started job.
    This function is ideal for multipage PDFs or an image.

    :param file_source: Path to a file stored locally, on an S3 bucket or a PIL Image
    :type file_source: Union[str, bytes, Image.Image]
    :param s3_output_path: Path to store the output on the S3 bucket (passed as param to Textractor).
    :type s3_output_path: str
    :param s3_upload_path: If given, will automatically upload the document to the given S3 prefix before calling Textract.
                           Files are uploaded under a uuid. If not given the data is expected to be already in S3.
    :type s3_upload_path: str, optional
    :param client_request_token: The idempotent token that's used to identify the start request. If you use the same token
                                 with multiple StartExpenseAnalysis requests, the same JobId is returned. Use
                                 ClientRequestToken to prevent the same job from being accidentally started more than once.
    :type client_request_token: str, optional
    :param job_tag: An identifier that you specify that's included in the completion notification published to the Amazon SNS topic.
    :type job_tag: str, optional
    :param save_image: When true, the source images are handed to the returned LazyDocument.
    :type save_image: bool
    :raises InputError: Raised when the file is not in S3 and no ``s3_upload_path`` was given
    :raises RegionMismatchError: Raised when the file source region is different from the API region.
    :raises exception: Raised if the Textract API call fails
    :return: Lazy-loaded Document object
    :rtype: LazyDocument
    """
    original_file_source = file_source

    # If the file is not already in S3 it has to be uploaded first.
    if not isinstance(file_source, str) or not file_source.startswith("s3://"):
        if not s3_upload_path:
            raise InputError(
                "For files not in S3, an S3 upload path must be provided"
            )

        # S3 keys always use "/"; os.path.join would insert "\\" on Windows.
        separator = "" if s3_upload_path.endswith("/") else "/"
        s3_file_path = f"{s3_upload_path}{separator}{uuid.uuid4()}"
        upload_to_s3(self.s3_client, s3_file_path, file_source)
        file_source = s3_file_path

    # Previously `output_config` was referenced without ever being assigned.
    output_config = None
    if s3_output_path:
        s3_bucket, s3_prefix = s3_path_to_bucket_and_prefix(s3_output_path)
        output_config = OutputConfig(s3_bucket=s3_bucket, s3_prefix=s3_prefix)

    try:
        response = call_textract_expense(
            input_document=file_source,
            output_config=output_config,
            kms_key_id=self.kms_key_id,
            job_tag=job_tag,
            notification_channel=None,  # not supported yet
            client_request_token=client_request_token,
            return_job_id=True,
            force_async_api=True,
            boto3_textract_client=self.textract_client,
            job_done_polling_interval=1,
        )
    except Exception as exception:
        if exception.__class__.__name__ == "InvalidS3ObjectException":
            raise RegionMismatchError(
                "Region passed in the profile_name and S3 bucket do not match. Ensure the regions are the same."
            ) from exception
        raise

    # Previously `images` was passed to LazyDocument without being assigned.
    images = None
    if save_image:
        if isinstance(original_file_source, Image.Image):
            images = [original_file_source]
        elif (
            isinstance(original_file_source, list)
            and len(original_file_source)
            and isinstance(original_file_source[0], Image.Image)
        ):
            images = original_file_source
        else:
            # NOTE(review): raw bytes are also routed to this path-based helper;
            # confirm it accepts bytes.
            images = self._get_document_images_from_path(original_file_source)

    return LazyDocument(
        response["JobId"],
        TextractAPI.EXPENSE,
        textract_client=self.textract_client,
        images=images,
    )

def get_result(
    self, job_id: str, api: Union[TextractAPI, Textract_API]
) -> Document:
    """
    Fetch the full output of a Textract job and parse it into a Document.

    :param job_id: Textract API JobID
    :type job_id: str, required
    :param api: API the job was started with; a ``TextractAPI`` value is converted
                to its ``Textract_API`` counterpart, anything else is passed through as is
    :type api: Union[TextractAPI, Textract_API], required
    :return: Returns a Document object with the raw response attached as ``response``
    :rtype: Document
    """
    # get_full_json is handed a Textract_API value, so convert when needed.
    if isinstance(api, TextractAPI):
        caller_api = TextractAPI.TextractAPI_to_Textract_API(api)
    else:
        caller_api = api

    raw_response = get_full_json(
        job_id,
        caller_api,
        boto3_textract_client=self.textract_client,
        job_done_polling_interval=1,
    )

    parsed = response_parser.parse(raw_response)
    parsed.response = raw_response
    return parsed

def _image_to_byte_array(image: Image.Image) -> bytes:
    """
    Encode a PIL image as JPEG bytes for processing a Document using the Textract service.

    The image is converted to RGB before it is saved in JPEG format.

    :param image: Image to be encoded
    :type image: PIL.Image.Image, required
    :return: Returns the JPEG-encoded bytes of the input image
    :rtype: bytes
    """
    # The context manager releases the in-memory buffer once the bytes are copied out.
    with io.BytesIO() as buffer:
        image.convert("RGB").save(buffer, format="JPEG")
        return buffer.getvalue()

Texto modificado

Abrir archivo

The Textract API and Textractor method mapping is as below. Use these wrappers to make calls and parse the responses
in one step.

"""

import io
import os
import boto3
import logging
import uuid
from PIL import Image
from copy import deepcopy
from typing import List, Union
from textractcaller import (
    call_textract,
    call_textract_analyzeid,
    call_textract_expense,
    OutputConfig,
    Query,
    QueriesConfig,
)
from textractcaller.t_call import Textract_Call_Mode, Textract_API, get_full_json
from textractor.data.constants import (
    TextractAPI,
    TextractFeatures,
)
from textractor.entities.document import Document
from textractor.entities.lazy_document import LazyDocument
from textractor.parsers import response_parser
from textractor.utils.s3_utils import upload_to_s3, s3_path_to_bucket_and_prefix
from textractor.utils.pdf_utils import rasterize_pdf
from textractor.exceptions import (
    InputError,
    IncorrectMethodException,
    MissingDependencyException,
    UnhandledCaseException,
    UnsupportedDocumentException,
    InvalidS3ObjectException,
)

logger = logging.getLogger(__name__)

# Probe for an optional PDF rasterizing backend: pypdfium2 is tried first and
# pdf2image is the fallback. IS_PDF_RENDERING_ENABLED records whether either
# one could be imported.
try:
    try:
        import pypdfium2
    except Exception:
        # Any failure to load pypdfium2 still falls back to pdf2image, but
        # KeyboardInterrupt/SystemExit are no longer swallowed as a bare
        # `except:` would do.
        import pdf2image

    IS_PDF_RENDERING_ENABLED = True
except ImportError:
    IS_PDF_RENDERING_ENABLED = False
    logger.info("pypdfium2 and pdf2image are both not installed, client-side PDF rasterizing is disabled")

class Textractor:
    """
    Initializes the customer credentials needed to make calls to Textract using boto3 package internally.

if self.profile_name is not None:
            self.session = boto3.session.Session(profile_name=self.profile_name)
        elif self.region_name is not None:
            self.session = boto3.session.Session(region_name=self.region_name)
        elif os.environ.get("AWS_REGION") or os.environ.get("AWS_DEFAULT_REGION"):
            # We support both AWS_REGION and AWS_DEFAULT_REGION, with AWS_REGION having precedence.
            self.region_name = os.environ.get("AWS_REGION") or os.environ.get("AWS_DEFAULT_REGION")
            self.session = boto3.session.Session(region_name=self.region_name)
        else:
            raise InputError(
                "Unable to initiate Textractor. Either profile_name or region requires an input parameter."
            )
        if self.region_name is not None:
            self.textract_client = self.session.client(
                "textract", region_name=self.region_name
            )
        else:
            self.textract_client = self.session.client("textract")
        self.s3_client = self.session.client("s3")

s3_client = (
                boto3.session.Session(profile_name=self.profile_name).client("s3")
                if self.profile_name is not None
                else boto3.session.Session(region_name=self.region_name).client("s3")
            )
            file_obj = s3_client.get_object(Bucket=bucket, Key=key).get("Body").read()
            if filepath.lower().endswith(".pdf"):
                if IS_PDF_RENDERING_ENABLED:
                    images = rasterize_pdf(file_obj)
                else:
                    raise MissingDependencyException(
                        "pdf2image is not installed. If you do not plan on using visualizations you can skip image generation using save_image=False in your function call."
                    )
            else:
                images = [Image.open(io.BytesIO(bytearray(file_obj)))]

else:
            if filepath.lower().endswith(".pdf"):
                if IS_PDF_RENDERING_ENABLED:
                    images = rasterize_pdf(filepath)
                else:
                    raise MissingDependencyException(
                        "pdf2image is not installed. If you do not plan on using visualizations you can skip image generation using save_image=False in your function call."
                    )
            else:
                images = [Image.open(filepath)]

if not images:
            raise UnhandledCaseException(f"Could not get any images from {filepath}")

return images

def detect_document_text(
        self, file_source, save_image: bool = True
    ) -> Document:
        """
        Make a call to the SYNC DetectDocumentText API, implicitly parses the response and produces a :class:`Document` object.
        This function is ideal for single page PDFs or single images.

:param file_source: Path to a file stored locally, on an S3 bucket or PIL Image
        :type file_source: str or PIL.Image, required
        :param save_image: Flag to indicate if document images are to be stored within the Document object. This is optional
                            and necessary only if the customer wants to visualize bounding boxes for their document entities.
        :type save_image: bool

elif isinstance(file_source, str):
            logger.debug("Filepath given.")
            if not save_image and file_source.lower().endswith(".pdf"):
                images = []
            else:
                images = self._get_document_images_from_path(file_source)
                if len(images) > 1:
                    raise IncorrectMethodException(
                        "Input contains more than 1 page. Call start_document_analysis() instead."
                    )
                file_source = _image_to_byte_array(images[0])

elif isinstance(file_source, Image.Image):
            logger.debug("PIL Image given.")
            images = [file_source]
            file_source = _image_to_byte_array(file_source)

elif isinstance(file_source, list) and isinstance(file_source[0], Image.Image):
            logger.debug("List of PIL Image given.")
            images = deepcopy(file_source)
            file_source = _image_to_byte_array(images[0])

else:
            images = []
            raise InputError("Input file_source format not supported.")

try:
            response = call_textract(
                input_document=file_source,
                features=[],
                queries_config=None,  # not supported yet
                output_config=None,
                kms_key_id=self.kms_key_id,
                job_tag="",
                notification_channel=None,  # not supported yet
                client_request_token="",
                return_job_id=False,
                force_async_api=False,
                call_mode=Textract_Call_Mode.FORCE_SYNC,
                boto3_textract_client=self.textract_client,
                job_done_polling_interval=0,
            )
        except Exception as exception:
            if exception.__class__.__name__ == "InvalidS3ObjectException":
                raise InvalidS3ObjectException(
                    "Textract returned InvalidS3ObjectException. Ensure that the s3 path is correct and that both the Textract API and the bucket are in the same region."
                )
            elif exception.__class__.__name__ == "UnsupportedDocumentException":
                raise UnsupportedDocumentException(
                    "Textract returned UnsupportedDocumentException, if file_source is a PDF, make sure that it only has one page or use start_document_text_detection. If your file_source is an image, make sure that it is not larger than 5MB."
                )
            raise exception

def start_document_text_detection(
    self,
    file_source: Union[str, bytes, Image.Image],
    s3_output_path: str = "",
    s3_upload_path: str = "",
    client_request_token: str = "",
    job_tag: str = "",
    save_image: bool = True,
) -> LazyDocument:
    """
    Make a call to the ASYNC StartDocumentTextDetection API.

    When ``file_source`` is not an ``s3://`` string it is uploaded under
    ``s3_upload_path`` (object name is a random UUID) before Textract is called.

    :param file_source: File bytes, PIL Image, or path to a file stored locally or in an S3 bucket
    :type file_source: Union[str, bytes, Image.Image], required
    :param s3_output_path: Prefix to store the output on the S3 bucket.
    :type s3_output_path: str
    :param s3_upload_path: S3 prefix the document is uploaded to when it is not already in S3.
    :type s3_upload_path: str, optional
    :param client_request_token: Idempotency token forwarded to the start request.
    :type client_request_token: str, optional
    :param job_tag: Job tag forwarded to the start request.
    :type job_tag: str, optional
    :param save_image: When true, the source images are handed to the returned LazyDocument.
    :type save_image: bool

    :raises InputError: If the file is not in S3 and no ``s3_upload_path`` was given.
    :raises InvalidS3ObjectException: If Textract raises ``InvalidS3ObjectException``.
    :return: Lazy-loaded Document object
    :rtype: LazyDocument
    """
    original_file_source = file_source

    # Only upload when the file is not already in S3; previously the upload
    # ran unconditionally, even for s3:// sources or an empty upload path.
    if not isinstance(file_source, str) or not file_source.startswith("s3://"):
        if not s3_upload_path:
            raise InputError(
                "For files not in S3, an S3 upload path must be provided"
            )

        # S3 keys always use "/"; os.path.join would insert "\\" on Windows.
        separator = "" if s3_upload_path.endswith("/") else "/"
        s3_file_path = f"{s3_upload_path}{separator}{uuid.uuid4()}"
        upload_to_s3(self.s3_client, s3_file_path, file_source)
        file_source = s3_file_path

    # Previously `output_config` was referenced without ever being assigned.
    output_config = None
    if s3_output_path:
        s3_bucket, s3_prefix = s3_path_to_bucket_and_prefix(s3_output_path)
        output_config = OutputConfig(s3_bucket=s3_bucket, s3_prefix=s3_prefix)

    try:
        response = call_textract(
            input_document=file_source,
            features=[],
            queries_config=None,  # not supported yet
            output_config=output_config,
            kms_key_id=self.kms_key_id,
            job_tag=job_tag,
            notification_channel=None,  # not supported yet
            client_request_token=client_request_token,
            return_job_id=True,
            force_async_api=True,
            call_mode=Textract_Call_Mode.FORCE_ASYNC,
            boto3_textract_client=self.textract_client,
            job_done_polling_interval=1,
        )
    except Exception as exception:
        # Matched by class name, as the original code does.
        if exception.__class__.__name__ == "InvalidS3ObjectException":
            raise InvalidS3ObjectException(
                "Textract returned InvalidS3ObjectException. Ensure that the s3 path is correct and that both the Textract API and the bucket are in the same region."
            ) from exception
        raise

    # Previously `images` was passed to LazyDocument without being assigned.
    images = None
    if save_image:
        if isinstance(original_file_source, Image.Image):
            images = [original_file_source]
        elif (
            isinstance(original_file_source, list)
            and len(original_file_source)
            and isinstance(original_file_source[0], Image.Image)
        ):
            images = original_file_source
        else:
            # NOTE(review): raw bytes are also routed to this path-based helper;
            # confirm it accepts bytes.
            images = self._get_document_images_from_path(original_file_source)

    return LazyDocument(
        response["JobId"],
        TextractAPI.DETECT_TEXT,
        textract_client=self.textract_client,
        images=images,
    )

def analyze_document(
        self,
        file_source,
        features,
        queries: Union[QueriesConfig, List[Query], List[str]] = None,
        save_image: bool = True,
    ) -> Document:
        """
        Make a call to the SYNC AnalyzeDocument API, implicitly parses the response and produces a :class:`Document` object.
        This function is ideal for single page PDFs or single images.

:param file_source: Path to a file stored locally, on an S3 bucket or PIL Image
        :type file_source: str or PIL.Image, required
        :param features: List of TextractFeatures to be extracted from the Document by the TextractAPI
        :type features: list, required
        :param queries: Queries to run on the document
        :type queries: Union[QueriesConfig, List[Query], List[str]]
        :param save_image: Flag to indicate if document images are to be stored within the Document object. This is optional
                            and necessary only if the customer wants to visualize bounding boxes for their document entities.
        :type save_image: bool

else:
            images = []
            raise InputError("Input file_source format not supported.")

if not isinstance(features, list):
            features = [features]

if queries and TextractFeatures.QUERIES not in features:
            raise InputError(
                "Queries were given as a parameter but QUERIES is not enabled in the feature set"
            )

try:
            response = call_textract(
                input_document=file_source,
                features=features,
                queries_config=queries,  # not supported yet
                output_config=None,
                kms_key_id=self.kms_key_id,
                job_tag="",
                notification_channel=None,  # not supported yet
                client_request_token="",
                return_job_id=False,
                force_async_api=False,
                call_mode=Textract_Call_Mode.FORCE_SYNC,
                boto3_textract_client=self.textract_client,
                job_done_polling_interval=0,
            )
        except Exception as exception:
            if exception.__class__.__name__ == "InvalidS3ObjectException":
                raise InvalidS3ObjectException(
                    "Textract returned InvalidS3ObjectException. Ensure that the s3 path is correct and that both the Textract API and the bucket are in the same region."
                )
            elif exception.__class__.__name__ == "UnsupportedDocumentException":
                raise UnsupportedDocumentException(
                    "Textract returned an UnsupportedDocumentException, if file_source is a PDF, make sure that it only has one page or use start_document_analysis. If your file_source is an image, make sure that it is not larger than 5MB."
                )
            raise exception

original_file_source = file_source

# If the file is not already in S3
        if not isinstance(file_source, str) or not file_source.startswith("s3://"):
            # Check if the user has given us a bucket to upload to
            if not s3_upload_path:
                raise InputError(
                    f"For files not in S3, an S3 upload path must be provided"
                )

s3_file_path = os.path.join(s3_upload_path, str(uuid.uuid4()))
            upload_to_s3(self.s3_client, s3_file_path, file_source)
            file_source = s3_file_path

if not isinstance(features, list):
            features = [features]

if queries and TextractFeatures.QUERIES not in features:
            raise InputError(
                "Queries were given as a parameter but QUERIES is not enabled in the feature set"
            )

try:
            response = call_textract(
                input_document=file_source,
                features=features,
                queries_config=queries,  # not supported yet
                output_config=output_config,
                kms_key_id=self.kms_key_id,
                job_tag=job_tag,
                notification_channel=None,  # not supported yet
                client_request_token=client_request_token,
                return_job_id=True,
                force_async_api=True,
                call_mode=Textract_Call_Mode.FORCE_ASYNC,
                boto3_textract_client=self.textract_client,
                job_done_polling_interval=1,
            )
        except Exception as exception:
            if exception.__class__.__name__ == "InvalidS3ObjectException":
                raise InvalidS3ObjectException(
                    "Textract returned InvalidS3ObjectException. Ensure that the s3 path is correct and that both the Textract API and the bucket are in the same region."
                )
            raise exception

return LazyDocument(
            response["JobId"],
            TextractAPI.ANALYZE,
            textract_client=self.textract_client,
            images=images,
            output_config=output_config,
        )

:param file_source: Path to a file stored locally, on an S3 bucket or list of PIL Images
        :type file_source: Union[str, List[Image.Image], List[str]]
        :param save_image: Saves the images in the returned Document object for visualizing the results, defaults to False
        :type save_image: bool, optional
        :raises InputError: Raised when the file_source could not be parsed
        :raises InvalidS3ObjectException: Raised when the S3 object passed as file source is in a region that does not match the one used to create the Textractor object.
        :raises exception: Raised when the Textract call fails
        :return: Document
        :rtype: Document
        """
        if isinstance(file_source, str):
            logger.debug("Filepath given.")
            images = self._get_document_images_from_path(file_source)
        elif isinstance(file_source, Image.Image):
            logger.debug("PIL Image given.")
            images = [file_source]
        elif isinstance(file_source, list) and isinstance(file_source[0], Image.Image):
            logger.debug("List of PIL Image given.")
            # FIXME: Is this needed?
            images = deepcopy(file_source)
        else:
            images = []
            raise InputError("Input file_source format not supported.")

images_bytes = [_image_to_byte_array(image) for image in images]

try:
            response = call_textract_analyzeid(
                document_pages=images_bytes,
                boto3_textract_client=self.textract_client,
            )
        except Exception as exception:
            if exception.__class__.__name__ == "InvalidS3ObjectException":
                raise InvalidS3ObjectException(
                    "Textract returned InvalidS3ObjectException. Ensure that the s3 path is correct and that both the Textract API and the bucket are in the same region."
                )
            raise exception

def analyze_expense(
        self,
        file_source: Union[str, List[Image.Image], List[str]],
        save_image: bool = True,
    ):
        """Make a call to the SYNC AnalyzeExpense API, implicitly parses the response and produces a :class:`Document` object.
        This function only accepts single-page input: a list with more than one image, or a path that yields more
        than one page image, raises :class:`IncorrectMethodException` pointing to ``start_expense_analysis``.
        The page is re-encoded with ``_image_to_byte_array`` before it is passed to ``call_textract_expense``.

        :param file_source: Path to a file stored locally, on an S3 bucket or PIL Image
        :type file_source: Union[str, List[Image.Image], List[str]]
        :param save_image: Whether to keep the file source as PIL Images inside the returned Document object, defaults to True
        :type save_image: bool, optional
        :raises IncorrectMethodException: Raised when the file source type is incompatible with the Textract API being called
        :raises InputError: Raised when the file source type is invalid. NOTE(review): a list holding a single
                            element also ends up here, because no branch handles lists of length <= 1 -- confirm intent.
        :raises InvalidS3ObjectException: Raised when the file source region differs from the API region.
        :raises exception: Raised if the Textract API call fails
        :return: Document
        :rtype: Document
        """
        # Multi-image lists belong to the asynchronous API.
        if isinstance(file_source, list) and len(file_source) > 1:
            raise IncorrectMethodException(
                "List contains more than 1 image. Call start_expense_analysis instead."
            )

# Path input: load the page images, insist on exactly one and send it as encoded bytes.
elif isinstance(file_source, str):
            logger.debug("Filepath given.")
            images = self._get_document_images_from_path(file_source)
            if len(images) > 1:
                raise IncorrectMethodException(
                    "Input contains more than 1 page. Call start_expense_analysis instead."
                )
            file_source = _image_to_byte_array(images[0])

# PIL input: keep a copy of the caller's image and send the encoded bytes.
elif isinstance(file_source, Image.Image):
            logger.debug("PIL Image given.")
            images = [file_source.copy()]
            file_source = _image_to_byte_array(file_source)

else:
            # NOTE(review): this assignment has no effect, the raise follows immediately.
            images = []
            raise InputError("Input file_source format not supported.")

# The synchronous call is made without an output configuration.
output_config = None

try:
            response = call_textract_expense(
                input_document=file_source,
                output_config=output_config,
                kms_key_id=self.kms_key_id,
                job_tag="",
                notification_channel=None,  # not supported yet
                client_request_token="",
                return_job_id=False,
                force_async_api=False,
                boto3_textract_client=self.textract_client,
                job_done_polling_interval=0,
            )
        except Exception as exception:
            # The exception is matched by class name and replaced by the package's own
            # exception type carrying an actionable message.
            if exception.__class__.__name__ == "InvalidS3ObjectException":
                raise InvalidS3ObjectException(
                    "Textract returned InvalidS3ObjectException. Ensure that the s3 path is correct and that both the Textract API and the bucket are in the same region."
                )
            raise exception

:param file_source: Path to a file stored locally, on an S3 bucket or a PIL Image
        :type file_source: Union[str, bytes, Image.Image]
        :param s3_output_path: Path to store the output on the S3 bucket (passed as param to Textractor).
        :type s3_output_path: str
        :param s3_upload_path: If given, will automatically upload the document to the given S3 prefix before calling Textract. Files are uploaded
                               under a uuid. If not given the data is expected to be already in s3
        :type s3_upload_path: str, optional
        :param client_request_token: The idempotent token that's used to identify the start request. If you use the same token
                                    with multiple StartDocumentTextDetection requests, the same JobId is returned. Use ClientRequestToken
                                    to prevent the same job from being accidentally started more than once.
        :type client_request_token: str, optional
        :param job_tag: An identifier that you specify that's included in the completion notification published to the Amazon SNS topic.
        :type job_tag: str, optional
        :param save_image: Flag to indicate if document images are to be stored within the Document object. This is optional
                            and necessary only if the customer wants to visualize bounding boxes for their document entities.
        :type save_image: bool
        :raises InputError: Raised when the file source type is invalid
        :raises InvalidS3ObjectException: Raised when the file source region differs from the API region.
        :raises exception: Raised if the Textract API call fails
        :return: Lazy-loaded Document object
        :rtype: LazyDocument
        """

original_file_source = file_source

# If the file is not already in S3
        if not isinstance(file_source, str) or not file_source.startswith("s3://"):
            # Check if the user has given us a bucket to upload to
            if not s3_upload_path:
                raise InputError(
                    f"For files not in S3, an S3 upload path must be provided"
                )

s3_file_path = os.path.join(s3_upload_path, str(uuid.uuid4()))
            upload_to_s3(self.s3_client, s3_file_path, file_source)
            file_source = s3_file_path

try:
            response = call_textract_expense(
                input_document=file_source,
                output_config=output_config,
                kms_key_id=self.kms_key_id,
                job_tag=job_tag,
                notification_channel=None,  # not supported yet
                client_request_token=client_request_token,
                return_job_id=True,
                force_async_api=True,
                boto3_textract_client=self.textract_client,
                job_done_polling_interval=1,
            )
        except Exception as exception:
            if exception.__class__.__name__ == "InvalidS3ObjectException":
                raise InvalidS3ObjectException(
                    "Textract returned InvalidS3ObjectException. Ensure that the s3 path is correct and that both the Textract API and the bucket are in the same region."
                )
            raise exception

return LazyDocument(
            response["JobId"],
            TextractAPI.EXPENSE,
            textract_client=self.textract_client,
            images=images,
        )

document = response_parser.parse(response)
        document.response = response

return document

def _image_to_byte_array(image: Image.Image) -> bytes:
    """
    Encode a PIL image as JPEG and return the encoded bytes for processing with the Textract service.

    The image is converted to RGB before it is saved.

    :param image: Image to be encoded
    :type image: PIL.Image.Image, required
    :return: JPEG-encoded content of the input image
    :rtype: bytes
    """
    with io.BytesIO() as buffer:
        # We set quality to 95 and subsampling to 0 because the pillow defaults are very low resolution
        image.convert("RGB").save(buffer, format="JPEG", quality=95, subsampling=0)
        # getvalue() has to be called while the buffer is still open.
        return buffer.getvalue()