/*
 * Decompiled with CFR 0.152.
 */
package org.nuxeo.labs.aws.textract;

import com.amazonaws.services.textract.model.AnalyzeDocumentResult;
import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.json.JSONArray;
import org.json.JSONObject;
import org.nuxeo.ecm.automation.core.annotations.Context;
import org.nuxeo.ecm.automation.core.annotations.Operation;
import org.nuxeo.ecm.automation.core.annotations.OperationMethod;
import org.nuxeo.ecm.automation.core.annotations.Param;
import org.nuxeo.ecm.automation.core.util.BlobList;
import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.CoreSession;
import org.nuxeo.ecm.core.api.DocumentModel;
import org.nuxeo.labs.aws.textract.TextractService;
import org.nuxeo.labs.aws.textract.TextractUtils;

@Operation(id="Textract.Analyze", category="Document", label="Textract.Analyze", description="Analyze the file using the synchonous Textract API. (see limitation in this case). Granularity is WORD, LINE. If returnRawJson is true, granularity is ignored and the operation saves the JSON String as returned by Textract. WORD and LINE set the values to a String, with a linefeed as separator. It does not return duplicates. features is a comma separated list of Textract features. If not passed, default is TABLES,FORMS. See AWS documentation for a list of features (as of August 2025: FORMS, LAYOUT, QUERIES, SIGNATURES and TABLES). For multipages, the blob is split in individual pages sent to textract and when asking for rawJson you receie an array, one pbject per page (but each one will state it is page 1). Async. calls are welcome via pull requests.")
public class AnalyzeOp {
    public static final String ID = "Textract.Analyze";

    /** Separator placed between extracted items and between pages; also used for de-duplication. */
    private static final String LINE_SEPARATOR = "\n";

    @Context
    protected CoreSession session;
    @Param(name="blobXPath", required=false)
    protected String blobXPath = "file:content";
    @Param(name="resultXPath", required=true)
    protected String resultXPath;
    @Param(name="features", required=false)
    protected String features = null;
    @Param(name="granularity", widget="Option", values={"WORD", "LINE"}, required=false)
    protected String granularity = "WORD";
    @Param(name="returnRawJson", required=false)
    protected Boolean returnRawJson = false;
    @Param(name="saveDocument", required=false)
    protected Boolean saveDocument = false;
    @Param(name="bucket", required=false, description="Only for unit testing")
    protected String bucket = null;
    @Param(name="bucketPrefix", required=false, description="Only for unit testing")
    protected String bucketPrefix = null;
    @Param(name="region", required=false, description="Only for unit testing")
    protected String region = null;

    /**
     * Runs Textract on the blob found at {@code blobXPath} and stores the outcome in
     * {@code resultXPath}.
     * <p>
     * When {@code TextractUtils.splitPDFIfMoreThanOnePage} yields several page blobs, each page is
     * analyzed separately and the per-page results are combined (a JSON array when
     * {@code returnRawJson} is true, otherwise de-duplicated text joined with line feeds).
     *
     * @param doc the document holding the blob to analyze; it also receives the result
     * @return the input document, or the saved document when {@code saveDocument} is true
     * @throws IllegalArgumentException if text is requested and {@code granularity} does not name a
     *         {@code TextractUtils.Granularity} constant
     */
    @OperationMethod
    public DocumentModel run(DocumentModel doc) {
        Blob blob = (Blob) doc.getPropertyValue(this.blobXPath);
        BlobList pageBlobs = TextractUtils.splitPDFIfMoreThanOnePage(blob);
        int pages = pageBlobs == null ? 1 : pageBlobs.size();

        List<String> featuresList = this.parseFeatures();
        TextractService service = this.resolveService();
        // Boolean.TRUE.equals(...) tolerates a null Boolean parameter instead of throwing on unboxing.
        boolean rawJson = Boolean.TRUE.equals(this.returnRawJson);

        Object result;
        if (pages == 1) {
            // Single page: the original blob is sent as-is. Granularity is only resolved when text
            // is requested, since it is documented as ignored for raw JSON.
            result = rawJson
                    ? service.analyzeGetRawResultJsonString(featuresList, blob)
                    : service.analyzeGetText(this.resolveGranularity(), featuresList, blob);
        } else {
            try {
                result = rawJson
                        ? this.analyzePagesAsRawJson(service, featuresList, pageBlobs)
                        : this.analyzePagesAsText(service, featuresList, pageBlobs);
            } finally {
                // Always remove the temporary per-page files, for both output modes and on failure.
                TextractUtils.deleteFilesSilently(pageBlobs);
            }
        }

        doc.setPropertyValue(this.resultXPath, (Serializable) result);
        if (Boolean.TRUE.equals(this.saveDocument)) {
            doc = this.session.saveDocument(doc);
        }
        return doc;
    }

    /**
     * Splits the comma separated {@code features} parameter into trimmed, non-empty names.
     *
     * @return the list of feature names, or {@code null} when {@code features} is blank
     */
    private List<String> parseFeatures() {
        if (StringUtils.isBlank((CharSequence) this.features)) {
            return null;
        }
        return Arrays.stream(this.features.split(","))
                .map(String::trim)
                .filter(s -> !s.isEmpty())
                .collect(Collectors.toList());
    }

    /**
     * Picks the Textract service instance: the explicitly configured one when {@code bucket},
     * {@code bucketPrefix} and {@code region} are all non-blank, the default one otherwise.
     */
    private TextractService resolveService() {
        if (StringUtils.isNoneBlank(new CharSequence[]{this.bucket, this.bucketPrefix, this.region})) {
            return TextractService.getInstance(this.bucket, this.bucketPrefix, this.region);
        }
        return TextractService.getInstance();
    }

    /** Converts the {@code granularity} parameter to its enum constant. */
    private TextractUtils.Granularity resolveGranularity() {
        return TextractUtils.Granularity.valueOf(this.granularity);
    }

    /** Analyzes every page and returns a JSON array string holding one object per page. */
    private String analyzePagesAsRawJson(TextractService service, List<String> featuresList, BlobList pageBlobs) {
        JSONArray allPages = new JSONArray();
        for (Blob pageBlob : pageBlobs) {
            AnalyzeDocumentResult analyzeResult = service.analyze(featuresList, pageBlob);
            allPages.put((Object) new JSONObject((Object) analyzeResult));
        }
        return allPages.toString();
    }

    /**
     * Analyzes every page and returns the extracted text of all pages, joined with line feeds and
     * passed through {@code TextractUtils.removeDuplicates}.
     */
    private String analyzePagesAsText(TextractService service, List<String> featuresList, BlobList pageBlobs) {
        // Resolved before the first Textract call so an invalid value fails fast.
        TextractUtils.Granularity pageGranularity = this.resolveGranularity();
        StringBuilder allText = new StringBuilder();
        boolean firstPage = true;
        for (Blob pageBlob : pageBlobs) {
            AnalyzeDocumentResult analyzeResult = service.analyze(featuresList, pageBlob);
            String onePageResult = TextractUtils.getAllTextJoined(() -> analyzeResult.getBlocks(), pageGranularity, LINE_SEPARATOR);
            // Pages must be separated by the same separator used inside a page, otherwise the
            // de-duplication below cannot tell where one page ends and the next begins.
            if (!firstPage) {
                allText.append(LINE_SEPARATOR);
            }
            allText.append(onePageResult);
            firstPage = false;
        }
        return TextractUtils.removeDuplicates(allText.toString(), LINE_SEPARATOR);
    }
}

