hats.catalog.dataset

hats.catalog.dataset#

Submodules#

Classes#

Dataset

A base HATS dataset, consisting of a properties file and data stored in parquet files.

Package Contents#

class Dataset(catalog_info: hats.catalog.dataset.table_properties.TableProperties, catalog_path: str | pathlib.Path | upath.UPath | None = None, schema: pyarrow.Schema | None = None, snapshot: hats.catalog.catalog_snapshot.CatalogSnapshot | None = None, generate_snapshot: bool = False)[source]#

A base HATS dataset, consisting of a properties file and data stored in parquet files.

catalog_info#

catalog_name#

catalog_path = None#

catalog_base_dir = None#

schema = None#

snapshot = None#

property original_schema: pyarrow.Schema | None#: The original on-disk schema, before any column selection.

property on_disk: bool#: Is the catalog stored on disk?

property unmodified: bool#: Has the catalog been modified from its original on disk state?

aggregate_column_statistics(exclude_hats_columns: bool = True, exclude_columns: list[str] = None, include_columns: list[str] = None)[source]#

Read footer statistics in parquet metadata, and report on global min/max values.

Parameters:

exclude_hats_columns (bool): exclude HATS spatial and partitioning fields from the statistics. Defaults to True.
exclude_columns (list[str]): additional columns to exclude from the statistics.
include_columns (list[str]): if specified, only return statistics for the column names provided. Defaults to None, and returns all non-hats columns.

Returns:

Dataframe: aggregated statistics.

per_pixel_statistics(*, exclude_hats_columns: bool = True, exclude_columns: list[str] | None = None, include_columns: list[str] | None = None, only_numeric_columns: bool = False, include_stats: list[str] | None = None, multi_index=False, per_row_group: bool = False)[source]#: Read footer statistics in parquet metadata, and report on statistics about each pixel partition.

per_partition_statistics(*, exclude_hats_columns: bool = True, exclude_columns: list[str] = None, include_columns: list[str] = None, only_numeric_columns: bool = False, include_stats: list[str] = None, multi_index=False, per_row_group: bool = False)[source]#

Read footer statistics in parquet metadata, and report on statistics about each pixel partition.

Parameters:

exclude_hats_columns (bool): exclude HATS spatial and partitioning fields from the statistics. Defaults to True.
exclude_columns (list[str]): additional columns to exclude from the statistics.
include_columns (list[str]): if specified, only return statistics for the column names provided. Defaults to None, and returns all non-hats columns.
include_stats (list[str]): if specified, only return the listed kinds of values (min_value, max_value, null_count, row_count). Defaults to None, and returns all values.
multi_index (bool): whether to create the returned frame with a multi-index, first on pixel, then on column name. Defaults to False, in which case the frame is indexed on pixel, with a separate column for each combination of data column and stat value.

Returns:

Dataframe: all statistics.