IonToParquet
yaml
type: "io.kestra.plugin.serdes.parquet.IonToParquet"
Examples
yaml
# Example flow: download a CSV file, aggregate it with a DuckDB query, and
# convert the stored query result to Parquet with the IonToParquet task.
id: ion_to_parquet
namespace: company.team

tasks:
  # Fetch the source CSV over HTTP; its location is exposed as
  # outputs.download_csv.uri.
  - id: download_csv
    type: io.kestra.plugin.core.http.Download
    description: salaries of data professionals from 2020 to 2023 (source ai-jobs.net)
    uri: https://huggingface.co/datasets/kestra/datasets/raw/main/csv/salaries.csv

  # Average salary per job title, keeping only titles with more than 10 rows,
  # ordered from highest to lowest average.
  - id: avg_salary_by_job_title
    type: io.kestra.plugin.jdbc.duckdb.Query
    inputFiles:
      # Makes the downloaded file available to the query as data.csv.
      data.csv: "{{ outputs.download_csv.uri }}"
    sql: |
      SELECT
        job_title,
        ROUND(AVG(salary),2) AS avg_salary
      FROM read_csv_auto('{{ workingDir }}/data.csv', header=True)
      GROUP BY job_title
      HAVING COUNT(job_title) > 10
      ORDER BY avg_salary DESC;
    # The next task reads this task's output URI, so the result is stored.
    store: true

  # Convert the file produced by the query into Parquet. The schema lists the
  # two columns selected above and their types.
  - id: result
    type: io.kestra.plugin.serdes.parquet.IonToParquet
    from: "{{ outputs.avg_salary_by_job_title.uri }}"
    schema: |
      {
        "type": "record",
        "name": "Salary",
        "namespace": "com.example.salary",
        "fields": [
          {"name": "job_title", "type": "string"},
          {"name": "avg_salary", "type": "double"}
        ]
      }
Properties
from *Required string
compressionCodec string
Default
GZIP
Possible Values
UNCOMPRESSED
SNAPPY
GZIP
ZSTD
dateFormat string
Default
yyyy-MM-dd[XXX]
datetimeFormat string
Default
yyyy-MM-dd'T'HH:mm[:ss][.SSSSSS][XXX]
decimalSeparator string
Default
.
dictionaryPageSize integer | string
Default
1048576
falseValues array
SubType string
Default
["f","false","disabled","0","off","no",""]
inferAllFields boolean | string
Default
false
nullValues array
SubType string
Default
["","#N/A","#N/A N/A","#NA","-1.#IND","-1.#QNAN","-NaN","1.#IND","1.#QNAN","NA","n/a","nan","null"]
numberOfRowsToScan integer | string
Default
100
pageSize integer | string
Default
1048576
parquetVersion string
Default
V2
Possible Values
V1
V2
rowGroupSize integer | string
Default
134217728
schema string
strictSchema boolean | string
Default
false
timeFormat string
Default
HH:mm[:ss][.SSSSSS][XXX]
timeZoneId string
Default
Etc/UTC
trueValues array
SubType string
Default
["t","true","enabled","1","on","yes"]
Outputs
uri string
Format
uri