Parse

yaml
# Fully qualified task type; the final segment is capitalised (`Parse`),
# matching the `type` value used by the example flows on this page.
type: "io.kestra.plugin.tika.Parse"
yaml
# Example flow: passes a user-supplied file to the Tika Parse task with
# `extractEmbedded` enabled and `store` disabled.
id: tika_parse
namespace: company.team

# One FILE-typed input, referenced by the task below.
inputs:
  - id: file
    type: FILE

tasks:
  - id: parse
    type: io.kestra.plugin.tika.Parse
    # Template expression pointing at the `file` input declared above.
    from: "{{ inputs.file }}"
    store: false
    extractEmbedded: true

yaml
# Example flow: passes a user-supplied file to the Tika Parse task with the
# OCR strategy set to OCR_AND_TEXT_EXTRACTION and `store` enabled.
id: tika_parse
namespace: company.team

# One FILE-typed input, referenced by the task below.
inputs:
  - id: file
    type: FILE

tasks:
  - id: parse
    type: io.kestra.plugin.tika.Parse
    # Template expression pointing at the `file` input declared above.
    from: "{{ inputs.file }}"
    store: true
    ocrOptions:
      strategy: OCR_AND_TEXT_EXTRACTION

yaml
# Example flow: downloads an image over HTTP, then feeds the downloaded file
# to the Tika Parse task.
id: parse-image-metadata-using-apache-tika
namespace: company.team

tasks:
  # Step 1: fetch the image; its `uri` output is consumed by the next task.
  - id: get_image
    type: io.kestra.plugin.core.http.Download
    uri: "https://kestra.io/blogs/2023-05-31-beginner-guide-kestra.jpg"

  # Step 2: parse the downloaded image with TEXT content type and the
  # OCR_AND_TEXT_EXTRACTION strategy; `store` is disabled.
  - id: tika
    type: io.kestra.plugin.tika.Parse
    from: "{{ outputs.get_image.uri }}"
    contentType: TEXT
    ocrOptions:
      strategy: OCR_AND_TEXT_EXTRACTION
    store: false

yaml
# Example flow: downloads a PDF, parses it with Tika as TEXT, and logs the
# parsed content.
id: parse-pdf
namespace: company.team

tasks:
  # Step 1: fetch the PDF; its `uri` output is consumed by the next task.
  - id: download_pdf
    type: io.kestra.plugin.core.http.Download
    uri: "https://huggingface.co/datasets/kestra/datasets/resolve/main/pdf/app_store.pdf"

  # Step 2: parse the downloaded file with `store` disabled; the log task
  # below reads `result.content` from this task's outputs.
  - id: parse_text
    type: io.kestra.plugin.tika.Parse
    from: "{{ outputs.download_pdf.uri }}"
    store: false
    contentType: TEXT

  # Step 3: write the parsed content to the execution log.
  - id: log_extracted_text
    type: io.kestra.plugin.core.log.Log
    message: "{{ outputs.parse_text.result.content }}"
Properties
Default XHTML
Possible Values
TEXT, XHTML, XHTML_NO_HEADER
Default false
Default { "strategy": "NO_OCR" }
Default true
Format uri
Default NO_OCR
Possible Values
AUTO, NO_OCR, OCR_ONLY, OCR_AND_TEXT_EXTRACTION
SubType string