26#include <unordered_map>
27#include <unordered_set>
31#include "iceberg/iceberg_export.h"
32#include "iceberg/result.h"
42 enum class Kind : uint8_t {
48 virtual Kind
kind()
const = 0;
70 explicit FileScanTask(std::shared_ptr<DataFile> data_file,
71 std::vector<std::shared_ptr<DataFile>> delete_files = {},
72 std::shared_ptr<Expression> filter =
nullptr);
75 const std::shared_ptr<DataFile>&
data_file()
const {
return data_file_; }
78 const std::vector<std::shared_ptr<DataFile>>&
delete_files()
const {
83 const std::shared_ptr<Expression>&
residual_filter()
const {
return residual_filter_; }
85 Kind
kind()
const override {
return Kind::kFileScanTask; }
86 int64_t size_bytes()
const override;
87 int32_t files_count()
const override;
88 int64_t estimated_row_count()
const override;
97 Result<ArrowArrayStream> ToArrow(
const std::shared_ptr<FileIO>& io,
98 std::shared_ptr<Schema> projected_schema)
const;
101 std::shared_ptr<DataFile> data_file_;
102 std::vector<std::shared_ptr<DataFile>> delete_files_;
103 std::shared_ptr<Expression> residual_filter_;
106enum class ChangelogOperation : uint8_t {
124 std::shared_ptr<DataFile> data_file,
125 std::vector<std::shared_ptr<DataFile>> delete_files = {},
126 std::shared_ptr<Expression> residual_filter =
nullptr)
127 : change_ordinal_(change_ordinal),
128 commit_snapshot_id_(commit_snapshot_id),
129 data_file_(
std::move(data_file)),
130 delete_files_(
std::move(delete_files)),
131 residual_filter_(
std::move(residual_filter)) {}
133 Kind
kind()
const override {
return Kind::kChangelogScanTask; }
135 int64_t size_bytes()
const override;
136 int32_t files_count()
const override;
137 int64_t estimated_row_count()
const override;
139 virtual ChangelogOperation operation()
const = 0;
148 const std::shared_ptr<Expression>&
residual_filter()
const {
return residual_filter_; }
151 int32_t change_ordinal_;
152 int64_t commit_snapshot_id_;
153 std::shared_ptr<DataFile> data_file_;
154 std::vector<std::shared_ptr<DataFile>> delete_files_;
155 std::shared_ptr<Expression> residual_filter_;
178 using ChangelogScanTask::ChangelogScanTask;
180 ChangelogOperation operation()
const override {
return ChangelogOperation::kInsert; }
183 const std::shared_ptr<DataFile>&
data_file()
const {
return data_file_; }
189 return delete_files_;
208 using ChangelogScanTask::ChangelogScanTask;
210 ChangelogOperation operation()
const override {
return ChangelogOperation::kDelete; }
213 const std::shared_ptr<DataFile>&
data_file()
const {
return data_file_; }
220 return delete_files_;
228 std::optional<int64_t> snapshot_id;
229 std::shared_ptr<Expression> filter;
230 bool ignore_residuals{
false};
231 bool case_sensitive{
true};
232 bool return_column_stats{
false};
233 std::unordered_set<int32_t> columns_to_keep_stats;
234 std::vector<std::string> selected_columns;
235 std::shared_ptr<Schema> projected_schema;
236 std::unordered_map<std::string, std::string> options;
237 bool from_snapshot_id_inclusive{
false};
238 std::optional<int64_t> from_snapshot_id;
239 std::optional<int64_t> to_snapshot_id;
240 std::string branch{};
241 std::optional<int64_t> min_rows_requested;
244 [[nodiscard]] Status Validate()
const;
252 std::is_base_of_v<IncrementalScan<ChangelogScanTask>, T>;
255template <
typename ScanType = DataTableScan>
261 static Result<std::unique_ptr<TableScanBuilder<ScanType>>> Make(
262 std::shared_ptr<TableMetadata> metadata, std::shared_ptr<FileIO> io);
290 TableScanBuilder& IncludeColumnStats(
const std::vector<std::string>& requested_columns);
344 TableScanBuilder& FromSnapshot(int64_t from_snapshot_id,
bool inclusive =
false)
357 TableScanBuilder& FromSnapshot(
const std::string& ref,
bool inclusive =
false)
391 Result<std::unique_ptr<ScanType>> Build();
394 TableScanBuilder(std::shared_ptr<TableMetadata> metadata, std::shared_ptr<FileIO> io);
397 Result<std::reference_wrapper<const std::shared_ptr<Schema>>> ResolveSnapshotSchema();
399 std::shared_ptr<TableMetadata> metadata_;
400 std::shared_ptr<FileIO> io_;
402 std::shared_ptr<Schema> snapshot_schema_;
411 const std::shared_ptr<TableMetadata>& metadata()
const;
414 Result<std::shared_ptr<Snapshot>> snapshot()
const;
417 Result<std::shared_ptr<Schema>> schema()
const;
423 const std::shared_ptr<FileIO>& io()
const;
426 const std::shared_ptr<Expression>& filter()
const;
429 bool is_case_sensitive()
const;
432 TableScan(std::shared_ptr<TableMetadata> metadata, std::shared_ptr<Schema> schema,
435 Result<std::reference_wrapper<const std::shared_ptr<Schema>>> ResolveProjectedSchema()
438 virtual const std::vector<std::string>& ScanColumns()
const;
440 const std::shared_ptr<TableMetadata> metadata_;
441 const std::shared_ptr<Schema> schema_;
442 const std::shared_ptr<FileIO> io_;
444 mutable std::shared_ptr<Schema> projected_schema_;
453 static Result<std::unique_ptr<DataTableScan>> Make(
454 std::shared_ptr<TableMetadata> metadata, std::shared_ptr<Schema> schema,
459 Result<std::vector<std::shared_ptr<FileScanTask>>> PlanFiles()
const;
462 using TableScan::TableScan;
467template <
typename ScanTaskType>
472 virtual Result<std::vector<std::shared_ptr<ScanTaskType>>> PlanFiles()
const = 0;
475 virtual Result<std::vector<std::shared_ptr<ScanTaskType>>> PlanFiles(
476 std::optional<int64_t> from_snapshot_id_exclusive,
477 int64_t to_snapshot_id_inclusive)
const = 0;
479 using TableScan::TableScan;
482 template <
typename T>
483 friend Result<std::vector<std::shared_ptr<T>>> ResolvePlanFiles(
484 const IncrementalScan<T>& scan);
491 static Result<std::unique_ptr<IncrementalAppendScan>> Make(
492 std::shared_ptr<TableMetadata> metadata, std::shared_ptr<Schema> schema,
497 Result<std::vector<std::shared_ptr<FileScanTask>>> PlanFiles()
const override;
500 Result<std::vector<std::shared_ptr<FileScanTask>>> PlanFiles(
501 std::optional<int64_t> from_snapshot_id_exclusive,
502 int64_t to_snapshot_id_inclusive)
const override;
504 using IncrementalScan::IncrementalScan;
512 static Result<std::unique_ptr<IncrementalChangelogScan>> Make(
513 std::shared_ptr<TableMetadata> metadata, std::shared_ptr<Schema> schema,
518 Result<std::vector<std::shared_ptr<ChangelogScanTask>>> PlanFiles()
const override;
521 Result<std::vector<std::shared_ptr<ChangelogScanTask>>> PlanFiles(
522 std::optional<int64_t> from_snapshot_id_exclusive,
523 int64_t to_snapshot_id_inclusive)
const override;
525 using IncrementalScan::IncrementalScan;
528extern template class ICEBERG_EXTERN_TEMPLATE_CLASS_EXPORT
529 TableScanBuilder<DataTableScan>;
530extern template class ICEBERG_EXTERN_TEMPLATE_CLASS_EXPORT
531 TableScanBuilder<IncrementalAppendScan>;
532extern template class ICEBERG_EXTERN_TEMPLATE_CLASS_EXPORT
533 TableScanBuilder<IncrementalChangelogScan>;
A scan task for inserts generated by adding a data file to the table.
Definition table_scan.h:176
const std::shared_ptr< DataFile > & data_file() const
The data file containing the added rows.
Definition table_scan.h:183
const std::vector< std::shared_ptr< DataFile > > & delete_files() const
A list of delete files to apply when reading the data file in this task.
Definition table_scan.h:188
A scan task for reading changelog entries between snapshots.
Definition table_scan.h:114
int32_t change_ordinal() const
The position of this change in the changelog order (0-based).
Definition table_scan.h:142
int64_t commit_snapshot_id() const
The snapshot ID that committed this change.
Definition table_scan.h:145
ChangelogScanTask(int32_t change_ordinal, int64_t commit_snapshot_id, std::shared_ptr< DataFile > data_file, std::vector< std::shared_ptr< DataFile > > delete_files={}, std::shared_ptr< Expression > residual_filter=nullptr)
Construct a ChangelogScanTask.
Definition table_scan.h:123
Kind kind() const override
The kind of scan task.
Definition table_scan.h:133
const std::shared_ptr< Expression > & residual_filter() const
Residual filter to apply after reading.
Definition table_scan.h:148
A scan that reads data files and applies delete files to filter rows.
Definition table_scan.h:448
A scan task for deletes generated by removing a data file from the table.
Definition table_scan.h:206
const std::vector< std::shared_ptr< DataFile > > & existing_deletes() const
A list of previously added delete files to apply when reading the data file in this task.
Definition table_scan.h:219
const std::shared_ptr< DataFile > & data_file() const
The data file that was deleted.
Definition table_scan.h:213
Base class for collecting errors in the builder pattern.
Definition error_collector.h:93
Task representing a data file and its corresponding delete files.
Definition table_scan.h:63
Kind kind() const override
The kind of scan task.
Definition table_scan.h:85
const std::shared_ptr< DataFile > & data_file() const
The data file that should be read by this scan task.
Definition table_scan.h:75
const std::shared_ptr< Expression > & residual_filter() const
Residual filter to apply after reading.
Definition table_scan.h:83
const std::vector< std::shared_ptr< DataFile > > & delete_files() const
Delete files that apply to this data file.
Definition table_scan.h:78
A scan that reads data files added between snapshots (incremental appends).
Definition table_scan.h:488
A scan that reads changelog entries between snapshots.
Definition table_scan.h:509
A base template class for incremental scans that read changes between snapshots, and return scan task...
Definition table_scan.h:468
An abstract scan task.
Definition table_scan.h:40
virtual int32_t files_count() const =0
The number of files that should be read by this scan task.
virtual int64_t estimated_row_count() const =0
The number of rows that should be read by this scan task.
virtual int64_t size_bytes() const =0
The number of bytes that should be read by this scan task.
virtual Kind kind() const =0
The kind of scan task.
Builder class for creating TableScan instances.
Definition table_scan.h:256
Represents a configured scan operation on a table.
Definition table_scan.h:406
Definition table_scan.h:251
Definition table_scan.h:227