# textractor.py -- text captured from a Diffchecker side-by-side comparison.
# Original side: 563 lines, 93 removals. Revised side: 542 lines, 83 additions.
"""
:class:`Textractor` is the main class associated with this package. It needs to be instantiated before using any of the functionalities
the package provides. The main use of this class is to make calls to the Textract API and create Python objects for all the
document entities that are returned in the JSON output of the API. The response received is implicitly parsed and a :class:`Document` type
object is returned containing all the document entities, their associated relationships and metadata.

The Textract API and Textractor method mapping is as below. Use these wrappers to make calls and parse the responses
in one step.

* (SYNC) DetectDocumentText : detect_document_text
* (SYNC) AnalyzeDocument : analyze_document
* (SYNC) AnalyzeID : analyze_id
* (SYNC) AnalyzeExpense : analyze_expense
* (ASYNC) StartDocumentTextDetection : start_document_text_detection
* (ASYNC) StartDocumentAnalysis : start_document_analysis
* (ASYNC) StartExpenseAnalysis : start_expense_analysis

"""


import io
import io
import os
import os
import boto3
import boto3
import logging
import logging
import uuid
import uuid
from PIL import Image
from PIL import Image
from copy import deepcopy
from copy import deepcopy
from typing import List, Union
from typing import List, Union
from textractcaller import (
from textractcaller import (
call_textract,
call_textract,
call_textract_analyzeid,
call_textract_analyzeid,
call_textract_expense,
call_textract_expense,
OutputConfig,
OutputConfig,
Query,
Query,
QueriesConfig,
QueriesConfig,
)
)
from textractcaller.t_call import Textract_Call_Mode, Textract_API, get_full_json
from textractcaller.t_call import Textract_Call_Mode, Textract_API, get_full_json

try:
from pdf2image import convert_from_bytes, convert_from_path

IS_PDF2IMAGE_INSTALLED = True
except ImportError:
IS_PDF2IMAGE_INSTALLED = False
logging.info("pdf2image is not installed, client-side PDF rasterizing is disabled")

from textractor.data.constants import (
from textractor.data.constants import (
TextractAPI,
TextractAPI,
TextractFeatures,
TextractFeatures,
)
)
from textractor.entities.document import Document
from textractor.entities.document import Document
from textractor.entities.lazy_document import LazyDocument
from textractor.entities.lazy_document import LazyDocument
from textractor.parsers import response_parser
from textractor.parsers import response_parser
from textractor.utils.s3_utils import upload_to_s3, s3_path_to_bucket_and_prefix
from textractor.utils.s3_utils import upload_to_s3, s3_path_to_bucket_and_prefix
from textractor.utils.pdf_utils import rasterize_pdf
from textractor.exceptions import (
from textractor.exceptions import (
InputError,
InputError,
RegionMismatchError,
IncorrectMethodException,
IncorrectMethodException,
MissingDependencyException,
MissingDependencyException,
UnhandledCaseException,
UnhandledCaseException,
UnsupportedDocumentException,
InvalidS3ObjectException,
)
)


# Module-level logger; handlers and levels are left to the application.
logger = logging.getLogger(__name__)

# Probe for an optional PDF rasterizing backend. pypdfium2 is tried first and
# pdf2image is the fallback. The modules are imported only to test availability.
try:
    try:
        import pypdfium2
    except Exception:
        # Any ordinary failure while importing pypdfium2 falls through to
        # pdf2image. ``Exception`` (rather than a bare ``except``) keeps
        # KeyboardInterrupt and SystemExit propagating.
        import pdf2image

    IS_PDF_RENDERING_ENABLED = True
except ImportError:
    IS_PDF_RENDERING_ENABLED = False
    logger.info("pypdfium2 and pdf2image are both not installed, client-side PDF rasterizing is disabled")


class Textractor:
class Textractor:
    """
    Initializes the customer credentials needed to make calls to Textract using boto3 package internally.

    :param profile_name: Customer's profile name as set in the ~/.aws/config file. This profile typically contains this format.
                         :code:`[default]
                         region = us-west-2
                         output=json`
    :type profile_name: str, optional
    :param region_name: If AWSCLI isn't setup, the user can pass region to let boto3 pick up credentials from the system.
    :type region_name: str, optional
    :param kms_key_id: Customer's AWS KMS key (cryptographic key)
    :type kms_key_id: str, optional
    """


def __init__(
    self,
    profile_name: str = None,
    region_name: str = None,
    kms_key_id: str = "",
):
    """
    Create the boto3 session and the Textract and S3 clients used by every other method.

    The session source is chosen in this order: ``profile_name``, then ``region_name``,
    then the ``AWS_REGION`` / ``AWS_DEFAULT_REGION`` environment variables
    (``AWS_REGION`` takes precedence).

    :param profile_name: Name of the AWS profile to build the session from.
    :type profile_name: str, optional
    :param region_name: AWS region to build the session from when no profile is given.
    :type region_name: str, optional
    :param kms_key_id: KMS key id forwarded to the Textract calls.
    :type kms_key_id: str, optional
    :raises InputError: If no profile, no region and no region environment variable is available.
    """
    self.profile_name = profile_name
    self.region_name = region_name
    self.kms_key_id = kms_key_id

    if self.profile_name is not None:
        self.session = boto3.session.Session(profile_name=self.profile_name)
    elif self.region_name is not None:
        self.session = boto3.session.Session(region_name=self.region_name)
    else:
        # We support both AWS_REGION and AWS_DEFAULT_REGION, with AWS_REGION having precedence.
        env_region = os.environ.get("AWS_REGION") or os.environ.get("AWS_DEFAULT_REGION")
        if not env_region:
            raise InputError(
                "Unable to initiate Textractor. Either profile_name or region requires an input parameter."
            )
        self.region_name = env_region
        self.session = boto3.session.Session(region_name=self.region_name)

    # An explicit region (argument or environment) is passed to the Textract client;
    # with only a profile the client is created without a region argument.
    if self.region_name is not None:
        self.textract_client = self.session.client(
            "textract", region_name=self.region_name
        )
    else:
        self.textract_client = self.session.client("textract")
    self.s3_client = self.session.client("s3")


def _get_document_images_from_path(self, filepath: str) -> List[Image.Image]:
    """
    Load every page of a document as a PIL image.

    PDFs (detected by a ``.pdf`` suffix, case-insensitive) are rasterized with
    :func:`rasterize_pdf`; anything else is opened with :func:`PIL.Image.open`.
    The document can be a local file or an ``s3://bucket/key`` object.

    :param filepath: filepath to the document stored locally or on an S3 bucket.
    :type filepath: str, required
    :return: Returns a list of PIL Images, one for each page of the document
    :rtype: List[Image]
    :raises MissingDependencyException: If the document is a PDF and no PDF rendering backend is installed.
    :raises InputError: If an S3 path does not contain both a bucket and a key.
    :raises UnhandledCaseException: If no image could be produced.
    """
    is_pdf = filepath.lower().endswith(".pdf")
    # Fail before any download happens when the PDF could not be rendered anyway.
    if is_pdf and not IS_PDF_RENDERING_ENABLED:
        raise MissingDependencyException(
            "Neither pypdfium2 nor pdf2image is installed. If you do not plan on using visualizations you can skip image generation using save_image=False in your function call."
        )

    if filepath.startswith("s3://"):
        # Only the leading scheme is stripped; the key may itself contain "s3://".
        bucket, _, key = filepath[len("s3://"):].partition("/")
        if not bucket or not key:
            raise InputError(
                f"S3 path must be of the form s3://bucket/key, got {filepath}"
            )
        # Reuse the client created in __init__ instead of building a new session per call.
        file_obj = self.s3_client.get_object(Bucket=bucket, Key=key)["Body"].read()
        if is_pdf:
            images = rasterize_pdf(file_obj)
        else:
            images = [Image.open(io.BytesIO(file_obj))]
    elif is_pdf:
        images = rasterize_pdf(filepath)
    else:
        images = [Image.open(filepath)]

    if not images:
        raise UnhandledCaseException(f"Could not get any images from {filepath}")

    return images


def detect_document_text(
    self, file_source, s3_output_path: str = "", save_image: bool = True
) -> Document:
    """
    Make a call to the SYNC DetectDocumentText API, implicitly parses the response and produces a :class:`Document` object.
    This function is ideal for single page PDFs or single images.

    :param file_source: Path to a file stored locally, on an S3 bucket or PIL Image
    :type file_source: str or PIL.Image, required
    :param s3_output_path: S3 path to store the output. When given it is turned into an ``OutputConfig``
                           and forwarded to ``call_textract``.
    :type s3_output_path: str, optional
    :param save_image: Flag to indicate if document images are to be stored within the Document object. This is optional
                       and necessary only if the customer wants to visualize bounding boxes for their document entities.
    :type save_image: bool

    :return: Returns a Document object containing all the entities, relationships and metadata extracted by the Textract
             DetectDocumentText API stored within it.
    :rtype: Document
    :raises IncorrectMethodException: If the input holds more than one page or image.
    :raises InputError: If ``file_source`` is not a path, a PIL image or a non-empty list of PIL images.
    :raises InvalidS3ObjectException: If Textract reports an invalid S3 object.
    :raises UnsupportedDocumentException: If Textract reports an unsupported document.
    """
    if isinstance(file_source, list) and len(file_source) > 1:
        raise IncorrectMethodException(
            "List contains more than 1 image. Call start_document_text_detection instead."
        )

    if isinstance(file_source, str):
        logger.debug("Filepath given.")
        if not save_image and file_source.lower().endswith(".pdf"):
            # No image is needed, so the PDF is not rasterized and the path
            # itself is handed to call_textract.
            images = []
        else:
            images = self._get_document_images_from_path(file_source)
            if len(images) > 1:
                raise IncorrectMethodException(
                    "Input contains more than 1 page. Call start_document_text_detection instead."
                )
            file_source = _image_to_byte_array(images[0])

    elif isinstance(file_source, Image.Image):
        logger.debug("PIL Image given.")
        images = [file_source]
        file_source = _image_to_byte_array(file_source)

    elif (
        isinstance(file_source, list)
        and file_source
        and isinstance(file_source[0], Image.Image)
    ):
        logger.debug("List of PIL Image given.")
        images = deepcopy(file_source)
        file_source = _image_to_byte_array(images[0])

    else:
        # Also reached for an empty list, which used to fail with IndexError.
        raise InputError("Input file_source format not supported.")

    output_config = None
    if s3_output_path:
        bucket, prefix = s3_path_to_bucket_and_prefix(s3_output_path)
        output_config = OutputConfig(s3_bucket=bucket, s3_prefix=prefix)

    try:
        response = call_textract(
            input_document=file_source,
            features=[],
            queries_config=None,  # not supported yet
            output_config=output_config,
            kms_key_id=self.kms_key_id,
            job_tag="",
            notification_channel=None,  # not supported yet
            client_request_token="",
            return_job_id=False,
            force_async_api=False,
            call_mode=Textract_Call_Mode.FORCE_SYNC,
            boto3_textract_client=self.textract_client,
            job_done_polling_interval=0,
        )
    except Exception as exception:
        # The service errors are matched by class name and re-raised as this
        # package's own exception types; everything else propagates unchanged.
        error_name = exception.__class__.__name__
        if error_name == "InvalidS3ObjectException":
            raise InvalidS3ObjectException(
                "Textract returned InvalidS3ObjectException. Ensure that the s3 path is correct and that both the Textract API and the bucket are in the same region."
            ) from exception
        if error_name == "UnsupportedDocumentException":
            raise UnsupportedDocumentException(
                "Textract returned UnsupportedDocumentException, if file_source is a PDF, make sure that it only has one page or use start_document_text_detection. If your file_source is an image, make sure that it is not larger than 5MB."
            ) from exception
        raise

    document = response_parser.parse(response)
    document.response = response
    if save_image:
        for page_index, page in enumerate(document.pages):
            page.image = images[page_index]
    return document


def start_document_text_detection(
    self,
    file_source: Union[str, bytes, Image.Image],
    s3_output_path: str = "",
    s3_upload_path: str = "",
    client_request_token: str = "",
    job_tag: str = "",
    save_image: bool = True,
) -> LazyDocument:
    """
    Make a call to the ASYNC StartDocumentTextDetection API.

    :param file_source: File bytes, path to a file stored locally or in an S3 bucket
    :type file_source: Union[str, bytes, Image.Image], required
    :param s3_output_path: Prefix to store the output on the S3 bucket (passed as param to Textractor).
    :type s3_output_path: str
    :param s3_upload_path: If given, will automatically upload the document to the given S3 prefix before calling Textract. Files are uploaded
                           under a uuid. If not given the data is expected to be already in s3
    :type s3_upload_path: str, optional
    :param client_request_token: The idempotent token that's used to identify the start request. If you use the same token
                                 with multiple StartDocumentTextDetection requests, the same JobId is returned. Use ClientRequestToken
                                 to prevent the same job from being accidentally started more than once.
    :type client_request_token: str, optional
    :param job_tag: An identifier that you specify that's included in the completion notification published to the Amazon SNS topic.
    :type job_tag: str, optional
    :param save_image: Flag to indicate if document images are to be stored within the Document object. This is optional
                       and necessary only if the customer wants to visualize bounding boxes for their document entities.
    :type save_image: bool

    :return: Lazy-loaded Document object
    :rtype: LazyDocument
    :raises InputError: If ``file_source`` has an unsupported type, or is not in S3 and no ``s3_upload_path`` is given.
    :raises InvalidS3ObjectException: If Textract reports an invalid S3 object.
    :raises MissingDependencyException: If images of a PDF are requested and no PDF rendering backend is installed.
    """
    # Kept because file_source is rebound to the uploaded S3 path below,
    # while the images are built from what the caller passed in.
    original_file_source = file_source

    if not isinstance(file_source, (str, bytes, Image.Image)):
        raise InputError(
            f"file_source needs to be of type str, bytes or PIL Image, not {type(file_source)}"
        )

    # If the file is not already in S3
    if not isinstance(file_source, str) or not file_source.startswith("s3://"):
        # Check if the user has given us a bucket to upload to
        if not s3_upload_path:
            raise InputError(
                "For files not in S3, an S3 upload path must be provided"
            )

        # S3 keys always use "/"; os.path.join would insert "\\" on Windows.
        s3_file_path = f"{s3_upload_path.rstrip('/')}/{uuid.uuid4()}"
        upload_to_s3(self.s3_client, s3_file_path, file_source)
        file_source = s3_file_path

    output_config = None
    if s3_output_path:
        s3_bucket, s3_prefix = s3_path_to_bucket_and_prefix(s3_output_path)
        output_config = OutputConfig(s3_bucket=s3_bucket, s3_prefix=s3_prefix)

    try:
        response = call_textract(
            input_document=file_source,
            features=[],
            queries_config=None,  # not supported yet
            output_config=output_config,
            kms_key_id=self.kms_key_id,
            job_tag=job_tag,
            notification_channel=None,  # not supported yet
            client_request_token=client_request_token,
            return_job_id=True,
            force_async_api=True,
            call_mode=Textract_Call_Mode.FORCE_ASYNC,
            boto3_textract_client=self.textract_client,
            job_done_polling_interval=1,
        )
    except Exception as exception:
        if exception.__class__.__name__ == "InvalidS3ObjectException":
            raise InvalidS3ObjectException(
                "Textract returned InvalidS3ObjectException. Ensure that the s3 path is correct and that both the Textract API and the bucket are in the same region."
            ) from exception
        raise

    images = None
    if save_image:
        if isinstance(original_file_source, Image.Image):
            images = [original_file_source]
        elif isinstance(original_file_source, bytes):
            # Raw bytes carry no file name, so a PDF is recognised by its
            # "%PDF" header. Passing bytes to the path-based loader would
            # fail with a TypeError.
            if original_file_source.startswith(b"%PDF"):
                if not IS_PDF_RENDERING_ENABLED:
                    raise MissingDependencyException(
                        "Neither pypdfium2 nor pdf2image is installed. If you do not plan on using visualizations you can skip image generation using save_image=False in your function call."
                    )
                images = rasterize_pdf(original_file_source)
            else:
                images = [Image.open(io.BytesIO(original_file_source))]
        else:
            images = self._get_document_images_from_path(original_file_source)

    return LazyDocument(
        response["JobId"],
        TextractAPI.DETECT_TEXT,
        textract_client=self.textract_client,
        images=images,
    )


def analyze_document(
    self,
    file_source,
    features,
    queries: Union[QueriesConfig, List[Query], List[str]] = None,
    s3_output_path: str = "",
    save_image: bool = True,
) -> Document:
    """
    Make a call to the SYNC AnalyzeDocument API, implicitly parses the response and produces a :class:`Document` object.
    This function is ideal for single page PDFs or single images.

    :param file_source: Path to a file stored locally, on an S3 bucket or PIL Image
    :type file_source: str or PIL.Image, required
    :param features: List of TextractFeatures to be extracted from the Document by the TextractAPI.
                     A single feature is wrapped into a list.
    :type features: list, required
    :param queries: Queries to run on the document
    :type queries: Union[QueriesConfig, List[Query], List[str]]
    :param s3_output_path: Prefix to store the output on the S3 bucket (passed as param to Textractor).
                           When given it is turned into an ``OutputConfig`` and forwarded to ``call_textract``.
    :type s3_output_path: str, optional
    :param save_image: Flag to indicate if document images are to be stored within the Document object. This is optional
                       and necessary only if the customer wants to visualize bounding boxes for their document entities.
    :type save_image: bool

    :return: Returns a Document object containing all the entities, relationships and metadata extracted by the Textract
             AnalyzeDocument API stored within it.
    :rtype: Document
    :raises IncorrectMethodException: If the input holds more than one page or image.
    :raises InputError: If ``file_source`` or ``queries`` has an unsupported type, or queries are given
                        without ``TextractFeatures.QUERIES`` in ``features``.
    :raises InvalidS3ObjectException: If Textract reports an invalid S3 object.
    :raises UnsupportedDocumentException: If Textract reports an unsupported document.
    """
    if isinstance(file_source, list) and len(file_source) > 1:
        raise IncorrectMethodException(
            "List contains more than 1 image. Call start_document_analysis() instead."
        )

    if isinstance(file_source, str):
        logger.debug("Filepath given.")
        if not save_image and file_source.lower().endswith(".pdf"):
            # No image is needed, so the PDF is not rasterized and the path
            # itself is handed to call_textract.
            images = []
        else:
            images = self._get_document_images_from_path(file_source)
            if len(images) > 1:
                raise IncorrectMethodException(
                    "Input contains more than 1 page. Call start_document_analysis() instead."
                )
            file_source = _image_to_byte_array(images[0])

    elif isinstance(file_source, Image.Image):
        logger.debug("PIL Image given.")
        images = [file_source]
        file_source = _image_to_byte_array(file_source)

    elif (
        isinstance(file_source, list)
        and file_source
        and isinstance(file_source[0], Image.Image)
    ):
        logger.debug("List of PIL Image given.")
        images = deepcopy(file_source)
        file_source = _image_to_byte_array(images[0])

    else:
        # Also reached for an empty list, which used to fail with IndexError.
        raise InputError("Input file_source format not supported.")

    output_config = None
    if s3_output_path:
        bucket, prefix = s3_path_to_bucket_and_prefix(s3_output_path)
        output_config = OutputConfig(s3_bucket=bucket, s3_prefix=prefix)

    if not isinstance(features, list):
        features = [features]

    if queries and TextractFeatures.QUERIES not in features:
        raise InputError(
            "Queries were given as a parameter but QUERIES is not enabled in the feature set"
        )

    # Normalise List[Query] / List[str] into a QueriesConfig. Only the first
    # element is inspected to decide which of the two list forms was given.
    if queries and not isinstance(queries, QueriesConfig):
        if not isinstance(queries, list) or not isinstance(queries[0], (Query, str)):
            raise InputError(
                f"Queries must be of type QueriesConfig, List[Query] or List[str], not {type(queries)}"
            )
        if isinstance(queries[0], Query):
            queries = QueriesConfig(queries)
        else:
            queries = QueriesConfig([Query(query) for query in queries])

    try:
        response = call_textract(
            input_document=file_source,
            features=features,
            queries_config=queries,
            output_config=output_config,
            kms_key_id=self.kms_key_id,
            job_tag="",
            notification_channel=None,  # not supported yet
            client_request_token="",
            return_job_id=False,
            force_async_api=False,
            call_mode=Textract_Call_Mode.FORCE_SYNC,
            boto3_textract_client=self.textract_client,
            job_done_polling_interval=0,
        )
    except Exception as exception:
        # The service errors are matched by class name and re-raised as this
        # package's own exception types; everything else propagates unchanged.
        error_name = exception.__class__.__name__
        if error_name == "InvalidS3ObjectException":
            raise InvalidS3ObjectException(
                "Textract returned InvalidS3ObjectException. Ensure that the s3 path is correct and that both the Textract API and the bucket are in the same region."
            ) from exception
        if error_name == "UnsupportedDocumentException":
            raise UnsupportedDocumentException(
                "Textract returned an UnsupportedDocumentException, if file_source is a PDF, make sure that it only has one page or use start_document_analysis. If your file_source is an image, make sure that it is not larger than 5MB."
            ) from exception
        raise

    document = response_parser.parse(response)
    document.response = response
    if save_image:
        for page_index, page in enumerate(document.pages):
            page.image = images[page_index]
    return document


def start_document_analysis(
def start_document_analysis(
self,
self,
file_source: Union[str, bytes, Image.Image],
file_source: Union[str, bytes, Image.Image],
features,
features,
s3_output_path: str = "",
s3_output_path: str = "",
s3_upload_path: str = "",
s3_upload_path: str = "",
queries: Union[QueriesConfig, List[Query], List[str]] = None,
queries: Union[QueriesConfig, List[Query], List[str]] = None,
client_request_token: str = "",
client_request_token: str = "",
job_tag: str = "",
job_tag: str = "",
save_image: bool = True,
save_image: bool = True,
) -> LazyDocument:
) -> LazyDocument:
"""
"""
Make a call to the ASYNC StartDocumentAnalysis API, implicitly parses the response and produces a :class:`Document` object.
Make a call to the ASYNC StartDocumentAnalysis API, implicitly parses the response and produces a :class:`Document` object.
This function is ideal for multipage PDFs or an image.
This function is ideal for multipage PDFs or an image.


:param file_source: Path to a file stored locally, on an S3 bucket or a PIL Image
:param file_source: Path to a file stored locally, on an S3 bucket or a PIL Image
:type file_source: Union[str, bytes, Image.Image], required
:type file_source: Union[str, bytes, Image.Image], required
:param features: List of TextractFeatures to be extracted from the Document by the TextractAPI
:param features: List of TextractFeatures to be extracted from the Document by the TextractAPI
:type features: list, required
:type features: list, required
:param s3_output_path: Path to store the output on the S3 bucket (passed as param to Textractor).
:param s3_output_path: Path to store the output on the S3 bucket (passed as param to Textractor).
:type s3_output_path: str
:type s3_output_path: str
:param s3_upload_path: If given, will automatically upload the document to the given S3 prefix before calling Textract. Files are uploaded
:param s3_upload_path: If given, will automatically upload the document to the given S3 prefix before calling Textract. Files are uploaded
under a uuid. If not given the data is expected to be already in s3
under a uuid. If not given the data is expected to be already in s3
:type s3_upload_path: str, optional
:type s3_upload_path: str, optional
:param client_request_token: The idempotent token that's used to identify the start request. If you use the same token
:param client_request_token: The idempotent token that's used to identify the start request. If you use the same token
with multiple StartDocumentAnalysis requests, the same JobId is returned. Use ClientRequestToken
with multiple StartDocumentAnalysis requests, the same JobId is returned. Use ClientRequestToken
to prevent the same job from being accidentally started more than once.
to prevent the same job from being accidentally started more than once.
:type client_request_token: str, optional
:type client_request_token: str, optional
:param job_tag: An identifier that you specify that's included in the completion notification published to the Amazon SNS topic.
:param job_tag: An identifier that you specify that's included in the completion notification published to the Amazon SNS topic.
:type job_tag: str, optional
:type job_tag: str, optional
:param save_image: Flag to indicate if document images are to be stored within the Document object. This is optional
:param save_image: Flag to indicate if document images are to be stored within the Document object. This is optional
and necessary only if the customer wants to visualize bounding boxes for their document entities.
and necessary only if the customer wants to visualize bounding boxes for their document entities.
:type save_image: bool
:type save_image: bool


:return: Returns a LazyDocument object for the entities, relationships and metadata extracted by the Textract
:return: Returns a LazyDocument object for the entities, relationships and metadata extracted by the Textract
StartDocumentAnalysis API.
StartDocumentAnalysis API.
:rtype: LazyDocument
:rtype: LazyDocument
"""
"""


original_file_source = file_source
original_file_source = file_source


if not isinstance(file_source, (str, bytes, Image.Image)):
if not isinstance(file_source, (str, bytes, Image.Image)):
raise InputError(
raise InputError(
f"file_source needs to be of type str, bytes or PIL Image, not {type(file_source)}"
f"file_source needs to be of type str, bytes or PIL Image, not {type(file_source)}"
)
)


# If the file is not already in S3
# If the file is not already in S3
if not isinstance(file_source, str) or not file_source.startswith("s3://"):
if not isinstance(file_source, str) or not file_source.startswith("s3://"):
# Check if the user has given us a bucket to upload to
# Check if the user has given us a bucket to upload to
if not s3_upload_path:
if not s3_upload_path:
raise InputError(
raise InputError(
f"For files not in S3, an S3 upload path must be provided"
f"For files not in S3, an S3 upload path must be provided"
)
)


s3_file_path = os.path.join(s3_upload_path, str(uuid.uuid4()))
s3_file_path = os.path.join(s3_upload_path, str(uuid.uuid4()))
upload_to_s3(self.s3_client, s3_file_path, file_source)
upload_to_s3(self.s3_client, s3_file_path, file_source)
file_source = s3_file_path
file_source = s3_file_path


output_config = None
output_config = None
if s3_output_path:
if s3_output_path:
s3_bucket, s3_prefix = s3_path_to_bucket_and_prefix(s3_output_path)
s3_bucket, s3_prefix = s3_path_to_bucket_and_prefix(s3_output_path)
output_config = OutputConfig(s3_bucket=s3_bucket, s3_prefix=s3_prefix)
output_config = OutputConfig(s3_bucket=s3_bucket, s3_prefix=s3_prefix)


if not isinstance(features, list):
if not isinstance(features, list):
features = [features]
features = [features]


if queries and TextractFeatures.QUERIES not in features:
if queries and TextractFeatures.QUERIES not in features:
raise InputError(
raise InputError(
"Queries were given as a parameter but QUERIES is not enabled in the feature set"
"Queries were given as a parameter but QUERIES is not enabled in the feature set"
)
)


if queries and not isinstance(queries, QueriesConfig):
if queries and not isinstance(queries, QueriesConfig):
if not isinstance(queries, List):
if not isinstance(queries, List):
raise InputError(
raise InputError(
f"Queries must be of type QueriesConfig, List[Query] or List[str], not {type(queries)}"
f"Queries must be of type QueriesConfig, List[Query] or List[str], not {type(queries)}"
)
)
if isinstance(queries[0], Query):
if isinstance(queries[0], Query):
queries_config = QueriesConfig(queries)
queries_config = QueriesConfig(queries)
queries = queries_config
queries = queries_config
elif isinstance(queries[0], str):
elif isinstance(queries[0], str):
queries_config = QueriesConfig([Query(query) for query in queries])
queries_config = QueriesConfig([Query(query) for query in queries])
queries = queries_config
queries = queries_config
else:
else:
raise InputError(
raise InputError(
f"Queries must be of type QueriesConfig, List[Query] or List[str], not {type(queries)}"
f"Queries must be of type QueriesC
)

try:
response = call_textract(
input_document=file_source,
features=features,
queries_config=queries, # not supported yet
output_config=output_config,
kms_key_id=self.kms_key_id,
job_tag=job_tag,
notification_channel=None, # not supported yet
client_request_token=client_request_token,
return_job_id=True,
force_async_api=True,
call_mode=Textract_Call_Mode.FORCE_ASYNC,
boto3_textract_client=self.textract_client,
job_done_polling_interval=1,
)
except Exception as exception:
if exception.__class__.__name__ == "InvalidS3ObjectException":
raise RegionMismatchError(
"Region passed in the profile_name and S3 bucket do not match. Ensure the regions are the same."
)
raise exception

images = None
if save_image:
if isinstance(original_file_source, Image.Image):
images = [original_file_source]
elif (