29#include <unordered_map>
32#include "iceberg/expression/literal.h"
33#include "iceberg/iceberg_export.h"
35#include "iceberg/result.h"
38#include "iceberg/util/partition_value_util.h"
48 int64_t apply_sequence_number;
51 mutable std::unordered_map<int32_t, Literal> lower_bounds;
52 mutable std::unordered_map<int32_t, Literal> upper_bounds;
53 mutable bool bounds_converted =
false;
57 wrapped(std::move(entry)),
62 return !wrapped.
data_file->lower_bounds.empty() &&
67 Result<std::optional<std::reference_wrapper<const Literal>>>
LowerBound(
69 ICEBERG_RETURN_UNEXPECTED(ConvertBoundsIfNeeded());
70 auto it = lower_bounds.find(
id);
71 return it != lower_bounds.cend() ? std::make_optional(std::cref(it->second))
76 Result<std::optional<std::reference_wrapper<const Literal>>>
UpperBound(
78 ICEBERG_RETURN_UNEXPECTED(ConvertBoundsIfNeeded());
79 auto it = upper_bounds.find(
id);
80 return it != upper_bounds.cend() ? std::make_optional(std::cref(it->second))
86 Status ConvertBoundsIfNeeded()
const;
92 if (data_lower > delete_upper) {
95 if (delete_lower > data_upper) {
102inline bool AllNull(
const std::map<int32_t, int64_t>& null_counts,
103 const std::map<int32_t, int64_t>& value_counts, int32_t field_id,
109 auto null_it = null_counts.find(field_id);
110 auto value_it = value_counts.find(field_id);
111 if (null_it == null_counts.cend() || value_it == value_counts.cend()) {
115 return null_it->second == value_it->second;
119inline bool AllNonNull(
const std::map<int32_t, int64_t>& null_counts, int32_t field_id,
125 auto it = null_counts.find(field_id);
126 if (it == null_counts.cend()) {
130 return it->second <= 0;
134inline bool ContainsNull(
const std::map<int32_t, int64_t>& null_counts, int32_t field_id,
140 auto it = null_counts.find(field_id);
141 if (it == null_counts.cend()) {
145 return it->second > 0;
149ICEBERG_EXPORT Result<bool> CanContainEqDeletesForFile(
150 const DataFile& data_file,
const EqualityDeleteFile& delete_file);
165 std::vector<std::shared_ptr<DataFile>> Filter(int64_t seq);
168 std::vector<std::shared_ptr<DataFile>> ReferencedDeleteFiles();
/// \brief Check if this group is empty.
///
/// \return true when `files_` holds no elements, false otherwise.
171 bool empty()
const {
return files_.empty(); }
174 void IndexIfNeeded();
176 std::vector<ManifestEntry> files_;
177 std::vector<int64_t> seqs_;
178 bool indexed_ =
false;
197 Result<std::vector<std::shared_ptr<DataFile>>> Filter(int64_t seq,
201 std::vector<std::shared_ptr<DataFile>> ReferencedDeleteFiles();
/// \brief Check if this group is empty.
///
/// \return true when `files_` holds no elements, false otherwise.
204 bool empty()
const {
return files_.empty(); }
207 void IndexIfNeeded();
210 std::vector<EqualityDeleteFile> files_;
211 std::vector<int64_t> seqs_;
212 bool indexed_ =
false;
243 bool has_equality_deletes()
const;
246 bool has_position_deletes()
const;
250 std::vector<std::shared_ptr<DataFile>> ReferencedDeleteFiles()
const;
256 Result<std::vector<std::shared_ptr<DataFile>>> ForEntry(
265 Result<std::vector<std::shared_ptr<DataFile>>> ForDataFile(int64_t sequence_number,
275 static Result<Builder> BuilderFor(
276 std::shared_ptr<FileIO> io, std::shared_ptr<Schema> schema,
277 std::unordered_map<int32_t, std::shared_ptr<PartitionSpec>> specs_by_id,
278 std::vector<ManifestFile> delete_manifests);
281 friend class Builder;
285 std::unique_ptr<internal::EqualityDeletes> global_deletes,
286 std::unique_ptr<
PartitionMap<std::unique_ptr<internal::EqualityDeletes>>>
287 eq_deletes_by_partition,
288 std::unique_ptr<
PartitionMap<std::unique_ptr<internal::PositionDeletes>>>
289 pos_deletes_by_partition,
291 std::unordered_map<std::string, std::unique_ptr<internal::PositionDeletes>>>
293 std::unique_ptr<std::unordered_map<std::string, ManifestEntry>> dv_by_path);
296 Result<std::vector<std::shared_ptr<DataFile>>> FindGlobalDeletes(
297 int64_t seq,
const DataFile& data_file)
const;
298 Result<std::vector<std::shared_ptr<DataFile>>> FindEqPartitionDeletes(
299 int64_t seq,
const DataFile& data_file)
const;
300 Result<std::vector<std::shared_ptr<DataFile>>> FindPosPartitionDeletes(
301 int64_t seq,
const DataFile& data_file)
const;
302 Result<std::vector<std::shared_ptr<DataFile>>> FindPathDeletes(
303 int64_t seq,
const DataFile& data_file)
const;
304 Result<std::shared_ptr<DataFile>> FindDV(int64_t seq,
const DataFile& data_file)
const;
307 std::unique_ptr<internal::EqualityDeletes> global_deletes_;
308 std::unique_ptr<PartitionMap<std::unique_ptr<internal::EqualityDeletes>>>
309 eq_deletes_by_partition_;
310 std::unique_ptr<PartitionMap<std::unique_ptr<internal::PositionDeletes>>>
311 pos_deletes_by_partition_;
313 std::unordered_map<std::string, std::unique_ptr<internal::PositionDeletes>>>
314 pos_deletes_by_path_;
315 std::unique_ptr<std::unordered_map<std::string, ManifestEntry>> dv_by_path_;
317 bool has_eq_deletes_ =
false;
318 bool has_pos_deletes_ =
false;
319 bool is_empty_ =
true;
/// \brief Construct a builder from manifest files.
///
/// \param io File IO instance (all arguments are taken by value).
/// \param schema Table schema.
/// \param specs_by_id Partition specs keyed by an int32_t ID
///        (presumably the spec ID -- confirm against the implementation).
/// \param delete_manifests Manifest files supplied to the builder
///        (presumably the delete manifests to index -- confirm).
325 Builder(std::shared_ptr<FileIO> io, std::shared_ptr<Schema> schema,
326 std::unordered_map<int32_t, std::shared_ptr<PartitionSpec>> specs_by_id,
327 std::vector<ManifestFile> delete_manifests);
331 Builder(Builder&&) noexcept;
332 Builder& operator=(Builder&&) noexcept;
333 Builder(const Builder&) = delete;
334 Builder& operator=(const Builder&) = delete;
339 Builder& AfterSequenceNumber(int64_t seq);
354 Builder& CaseSensitive(
bool case_sensitive);
357 Builder& IgnoreResiduals();
371 Status AddPositionDelete(
372 std::unordered_map<
std::
string,
std::unique_ptr<internal::PositionDeletes>>&
374 PartitionMap<
std::unique_ptr<internal::PositionDeletes>>& deletes_by_partition,
378 Status AddEqualityDelete(
379 internal::EqualityDeletes& global_deletes,
380 PartitionMap<
std::unique_ptr<internal::EqualityDeletes>>& deletes_by_partition,
387 int64_t min_sequence_number_ = 0;
391 bool case_sensitive_ = true;
392 bool ignore_residuals_ = false;
An index of delete files by sequence number.
Definition delete_file_index.h:228
Builder(std::shared_ptr< FileIO > io, std::shared_ptr< Schema > schema, std::unordered_map< int32_t, std::shared_ptr< PartitionSpec > > specs_by_id, std::vector< ManifestFile > delete_manifests)
Construct a builder from manifest files.
Base class for collecting errors in the builder pattern.
Definition error_collector.h:93
Represents a boolean expression tree.
Definition expression.h:37
Pluggable module for reading, writing, and deleting files.
Definition file_io.h:115
Literal is a literal value that is associated with a primitive type.
Definition literal.h:39
A map that uses a pair of spec ID and partition tuple as keys.
Definition partition_value_util.h:115
A set that uses a pair of spec ID and partition tuple as elements.
Definition partition_value_util.h:204
A partition spec for a Table.
Definition partition_spec.h:47
A schema for a Table.
Definition schema.h:49
A group of equality delete files sorted by apply sequence number.
Definition delete_file_index.h:185
bool empty() const
Check if this group is empty.
Definition delete_file_index.h:204
A group of position delete files sorted by the sequence number they apply to.
Definition delete_file_index.h:156
bool empty() const
Check if this group is empty.
Definition delete_file_index.h:171
bool AllNull(const std::map< int32_t, int64_t > &null_counts, const std::map< int32_t, int64_t > &value_counts, int32_t field_id, bool is_required)
Check if a value count map indicates all values are null.
Definition delete_file_index.h:102
bool RangesOverlap(const Literal &data_lower, const Literal &data_upper, const Literal &delete_lower, const Literal &delete_upper)
Check if two ranges overlap.
Definition delete_file_index.h:90
bool ContainsNull(const std::map< int32_t, int64_t > &null_counts, int32_t field_id, bool is_required)
Check if the column contains any null values.
Definition delete_file_index.h:134
bool AllNonNull(const std::map< int32_t, int64_t > &null_counts, int32_t field_id, bool is_required)
Check if all values are non-null.
Definition delete_file_index.h:119
DataFile carries data file path, partition tuple, metrics, ...
Definition manifest_entry.h:62
A manifest is an immutable Avro file that lists data files or delete files, along with each file's partition data tuple, metrics, and tracking information.
Definition manifest_entry.h:307
std::optional< int64_t > sequence_number
Definition manifest_entry.h:328
std::shared_ptr< DataFile > data_file
Definition manifest_entry.h:344
Entry in a manifest list.
Definition manifest_list.h:85
Wrapper for equality delete files that caches converted bounds.
Definition delete_file_index.h:45
Result< std::optional< std::reference_wrapper< const Literal > > > UpperBound(int32_t id) const
Get the upper bound for a field ID.
Definition delete_file_index.h:76
Result< std::optional< std::reference_wrapper< const Literal > > > LowerBound(int32_t id) const
Get the lower bound for a field ID.
Definition delete_file_index.h:67
bool HasLowerAndUpperBounds() const
Check if this delete file has both lower and upper bounds.
Definition delete_file_index.h:61