feat: Support Parquet writer options #1123

Open · wants to merge 2 commits into base: main
4 changes: 3 additions & 1 deletion python/datafusion/__init__.py
@@ -42,7 +42,7 @@
SessionContext,
SQLOptions,
)
from .dataframe import DataFrame
from .dataframe import DataFrame, ParquetColumnOptions, ParquetWriterOptions
from .expr import (
Expr,
WindowFrame,
@@ -66,6 +66,8 @@
"ExecutionPlan",
"Expr",
"LogicalPlan",
"ParquetColumnOptions",
"ParquetWriterOptions",
"RecordBatch",
"RecordBatchStream",
"RuntimeEnvBuilder",
221 changes: 221 additions & 0 deletions python/datafusion/dataframe.py
@@ -53,6 +53,8 @@

from enum import Enum

from datafusion._internal import ParquetColumnOptions as ParquetColumnOptionsInternal
from datafusion._internal import ParquetWriterOptions as ParquetWriterOptionsInternal
from datafusion.expr import Expr, SortExpr, sort_or_default


@@ -114,6 +116,173 @@ def get_default_level(self) -> Optional[int]:
return None


class ParquetWriterOptions:
"""Advanced parquet writer options.

Allows setting the writer options that apply to the entire file. Some options can
also be set on a column-by-column basis with the field `column_specific_options`
(see `ParquetColumnOptions`).

Attributes:
data_pagesize_limit: Sets best effort maximum size of data page in bytes.
write_batch_size: Sets write_batch_size in bytes.
writer_version: Sets parquet writer version. Valid values are `1.0` and
`2.0`.
skip_arrow_metadata: Skip encoding the embedded arrow metadata in the
KV_meta.
compression: Compression type to use. Default is "zstd(3)".
Available compression types are
- "uncompressed": No compression.
- "snappy": Snappy compression.
- "gzip(n)": Gzip compression with level n.
- "brotli(n)": Brotli compression with level n.
- "lz4": LZ4 compression.
- "lz4_raw": LZ4_RAW compression.
- "zstd(n)": Zstandard compression with level n.
dictionary_enabled: Sets if dictionary encoding is enabled. If None, uses
the default parquet writer setting.
dictionary_page_size_limit: Sets best effort maximum dictionary page size,
in bytes.
statistics_enabled: Sets if statistics are enabled for any column. Valid
values are `none`, `chunk`, and `page`. If None, uses the default
parquet writer setting.
max_row_group_size: Target maximum number of rows in each row group
(defaults to 1M rows). Writing larger row groups requires more memory to
write, but can get better compression and be faster to read.
created_by: Sets "created by" property.
column_index_truncate_length: Sets column index truncate length.
statistics_truncate_length: Sets statistics truncate length. If None, uses
the default parquet writer setting.
data_page_row_count_limit: Sets best effort maximum number of rows in a data
page.
encoding: Sets default encoding for any column. Valid values are `plain`,
`plain_dictionary`, `rle`, `bit_packed`, `delta_binary_packed`,
`delta_length_byte_array`, `delta_byte_array`, `rle_dictionary`, and
`byte_stream_split`. If None, uses the default parquet writer setting.
bloom_filter_on_write: Write bloom filters for all columns when creating
parquet files.
bloom_filter_fpp: Sets bloom filter false positive probability. If None,
uses the default parquet writer setting.
bloom_filter_ndv: Sets bloom filter number of distinct values. If None, uses
the default parquet writer setting.
allow_single_file_parallelism: Controls whether DataFusion will attempt to
speed up writing parquet files by serializing them in parallel. Each
column in each row group in each output file is serialized in parallel,
leveraging a maximum possible core count of n_files * n_row_groups *
n_columns.
maximum_parallel_row_group_writers: By default parallel parquet writer is
tuned for minimum memory usage in a streaming execution plan. You may
see a performance benefit when writing large parquet files by increasing
`maximum_parallel_row_group_writers` and
`maximum_buffered_record_batches_per_stream` if your system has idle
cores and can tolerate additional memory usage. Boosting these values is
likely worthwhile when writing out already in-memory data, such as from
a cached data frame.
maximum_buffered_record_batches_per_stream: See
`maximum_parallel_row_group_writers`.
column_specific_options: Overrides options for specific columns. If a column
is not present in this dictionary, it falls back to the file-level options above.
"""

def __init__(
self,
data_pagesize_limit: int = 1024 * 1024,
write_batch_size: int = 1024,
writer_version: str = "1.0",
skip_arrow_metadata: bool = False,
compression: Optional[str] = "zstd(3)",
dictionary_enabled: Optional[bool] = True,
dictionary_page_size_limit: int = 1024 * 1024,
statistics_enabled: Optional[str] = "page",
max_row_group_size: int = 1024 * 1024,
created_by: str = "datafusion-python",
column_index_truncate_length: Optional[int] = 64,
statistics_truncate_length: Optional[int] = None,
data_page_row_count_limit: int = 20_000,
encoding: Optional[str] = None,
bloom_filter_on_write: bool = False,
bloom_filter_fpp: Optional[float] = None,
bloom_filter_ndv: Optional[int] = None,
allow_single_file_parallelism: bool = True,
maximum_parallel_row_group_writers: int = 1,
maximum_buffered_record_batches_per_stream: int = 2,
column_specific_options: Optional[dict[str, ParquetColumnOptions]] = None,
) -> None:
"""Initialize the ParquetWriterOptions."""
self.data_pagesize_limit = data_pagesize_limit
self.write_batch_size = write_batch_size
self.writer_version = writer_version
self.skip_arrow_metadata = skip_arrow_metadata
self.compression = compression
self.dictionary_enabled = dictionary_enabled
self.dictionary_page_size_limit = dictionary_page_size_limit
self.statistics_enabled = statistics_enabled
self.max_row_group_size = max_row_group_size
self.created_by = created_by
self.column_index_truncate_length = column_index_truncate_length
self.statistics_truncate_length = statistics_truncate_length
self.data_page_row_count_limit = data_page_row_count_limit
self.encoding = encoding
self.bloom_filter_on_write = bloom_filter_on_write
self.bloom_filter_fpp = bloom_filter_fpp
self.bloom_filter_ndv = bloom_filter_ndv
self.allow_single_file_parallelism = allow_single_file_parallelism
self.maximum_parallel_row_group_writers = maximum_parallel_row_group_writers
self.maximum_buffered_record_batches_per_stream = (
maximum_buffered_record_batches_per_stream
)
self.column_specific_options = column_specific_options
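
A minimal usage sketch (illustrative only, not part of this diff): override a few
file-level settings and keep the remaining defaults listed above.

# Sketch: file-level writer options with a handful of overrides; every
# argument not passed here keeps the default shown in __init__ above.
writer_options = ParquetWriterOptions(
    compression="snappy",
    max_row_group_size=100_000,
    bloom_filter_on_write=True,
)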


class ParquetColumnOptions:
"""Parquet options for individual columns.

Contains the available options that can be applied to an individual Parquet column,
overriding the global options in `ParquetWriterOptions`.

Attributes:
encoding: Sets encoding for the column path. Valid values are: `plain`,
`plain_dictionary`, `rle`, `bit_packed`, `delta_binary_packed`,
`delta_length_byte_array`, `delta_byte_array`, `rle_dictionary`, and
`byte_stream_split`. These values are not case-sensitive. If `None`, uses
the default parquet options.
dictionary_enabled: Sets if dictionary encoding is enabled for the column path.
If `None`, uses the default parquet options.
compression: Sets default parquet compression codec for the column path. Valid
values are `uncompressed`, `snappy`, `gzip(level)`, `lzo`, `brotli(level)`,
`lz4`, `zstd(level)`, and `lz4_raw`. These values are not case-sensitive. If
`None`, uses the default parquet options.
statistics_enabled: Sets if statistics are enabled for the column. Valid values
are `none`, `chunk`, and `page`. These values are not case-sensitive. If
`None`, uses the default parquet options.
bloom_filter_enabled: Sets if bloom filter is enabled for the column path. If
`None`, uses the default parquet options.
bloom_filter_fpp: Sets bloom filter false positive probability for the column
path. If `None`, uses the default parquet options.
bloom_filter_ndv: Sets bloom filter number of distinct values. If `None`, uses
the default parquet options.
"""

def __init__(
self,
encoding: Optional[str] = None,
dictionary_enabled: Optional[bool] = None,
compression: Optional[str] = None,
statistics_enabled: Optional[str] = None,
bloom_filter_enabled: Optional[bool] = None,
bloom_filter_fpp: Optional[float] = None,
bloom_filter_ndv: Optional[int] = None,
) -> None:
"""Initialize the ParquetColumnOptions."""
self.encoding = encoding
self.dictionary_enabled = dictionary_enabled
self.compression = compression
self.statistics_enabled = statistics_enabled
self.bloom_filter_enabled = bloom_filter_enabled
self.bloom_filter_fpp = bloom_filter_fpp
self.bloom_filter_ndv = bloom_filter_ndv
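
A hedged sketch (not part of the diff) of per-column overrides; the column name
"user_id" is hypothetical.

# Sketch: enable dictionary encoding and a bloom filter only for the
# hypothetical "user_id" column; all other columns use the file-level options.
column_options = {
    "user_id": ParquetColumnOptions(
        dictionary_enabled=True,
        bloom_filter_enabled=True,
        bloom_filter_fpp=0.01,
    ),
}
writer_options = ParquetWriterOptions(
    compression="zstd(3)",
    column_specific_options=column_options,
)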


class DataFrame:
"""Two dimensional table representation of data.

@@ -737,6 +906,58 @@ def write_parquet(

self.df.write_parquet(str(path), compression.value, compression_level)

def write_parquet_options(
self, path: str | pathlib.Path, options: ParquetWriterOptions
) -> None:
"""Execute the :py:class:`DataFrame` and write the results to a Parquet file.

Allows advanced writer options to be set with `ParquetWriterOptions`.

Args:
path: Path of the Parquet file to write.
options: Sets the parquet writer options (see `ParquetWriterOptions`).
"""
options_internal = ParquetWriterOptionsInternal(
options.data_pagesize_limit,
options.write_batch_size,
options.writer_version,
options.skip_arrow_metadata,
options.compression,
options.dictionary_enabled,
options.dictionary_page_size_limit,
options.statistics_enabled,
options.max_row_group_size,
options.created_by,
options.column_index_truncate_length,
options.statistics_truncate_length,
options.data_page_row_count_limit,
options.encoding,
options.bloom_filter_on_write,
options.bloom_filter_fpp,
options.bloom_filter_ndv,
options.allow_single_file_parallelism,
options.maximum_parallel_row_group_writers,
options.maximum_buffered_record_batches_per_stream,
)

column_specific_options_internal = {}
for column, opts in (options.column_specific_options or {}).items():
column_specific_options_internal[column] = ParquetColumnOptionsInternal(
bloom_filter_enabled=opts.bloom_filter_enabled,
encoding=opts.encoding,
dictionary_enabled=opts.dictionary_enabled,
compression=opts.compression,
statistics_enabled=opts.statistics_enabled,
bloom_filter_fpp=opts.bloom_filter_fpp,
bloom_filter_ndv=opts.bloom_filter_ndv,
)

self.df.write_parquet_options(
str(path),
options_internal,
column_specific_options_internal,
)
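
An end-to-end sketch (illustrative only; the CSV path and its contents are
hypothetical) of writing a DataFrame with these options.

# Sketch: read a hypothetical CSV file and write it back out as Parquet
# using the write_parquet_options API added in this PR.
from datafusion import ParquetWriterOptions, SessionContext

ctx = SessionContext()
df = ctx.read_csv("data.csv")
df.write_parquet_options(
    "data.parquet",
    ParquetWriterOptions(compression="snappy", max_row_group_size=250_000),
)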

def write_json(self, path: str | pathlib.Path) -> None:
"""Execute the :py:class:`DataFrame` and write the results to a JSON file.
