iceberg-cpp
Loading...
Searching...
No Matches
delete_file_index.h
Go to the documentation of this file.
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20#pragma once
21
24
25#include <functional>
26#include <map>
27#include <memory>
28#include <optional>
29#include <unordered_map>
30#include <vector>
31
32#include "iceberg/expression/literal.h"
33#include "iceberg/iceberg_export.h"
35#include "iceberg/result.h"
36#include "iceberg/type_fwd.h"
38#include "iceberg/util/partition_value_util.h"
39
40namespace iceberg {
41
42namespace internal {
43
45struct ICEBERG_EXPORT EqualityDeleteFile {
46 const Schema* schema;
47 ManifestEntry wrapped;
48 int64_t apply_sequence_number; // = data_sequence_number - 1
49
50 // Lazily converted bounds for pruning
51 mutable std::unordered_map<int32_t, Literal> lower_bounds;
52 mutable std::unordered_map<int32_t, Literal> upper_bounds;
53 mutable bool bounds_converted = false;
54
55 EqualityDeleteFile(const Schema* schema, ManifestEntry&& entry)
56 : schema(schema),
57 wrapped(std::move(entry)),
58 apply_sequence_number(wrapped.sequence_number.value() - 1) {}
59
62 return !wrapped.data_file->lower_bounds.empty() &&
63 !wrapped.data_file->upper_bounds.empty();
64 }
65
67 Result<std::optional<std::reference_wrapper<const Literal>>> LowerBound(
68 int32_t id) const {
69 ICEBERG_RETURN_UNEXPECTED(ConvertBoundsIfNeeded());
70 auto it = lower_bounds.find(id);
71 return it != lower_bounds.cend() ? std::make_optional(std::cref(it->second))
72 : std::nullopt;
73 }
74
76 Result<std::optional<std::reference_wrapper<const Literal>>> UpperBound(
77 int32_t id) const {
78 ICEBERG_RETURN_UNEXPECTED(ConvertBoundsIfNeeded());
79 auto it = upper_bounds.find(id);
80 return it != upper_bounds.cend() ? std::make_optional(std::cref(it->second))
81 : std::nullopt;
82 }
83
84 private:
86 Status ConvertBoundsIfNeeded() const;
87};
88
90inline bool RangesOverlap(const Literal& data_lower, const Literal& data_upper,
91 const Literal& delete_lower, const Literal& delete_upper) {
92 if (data_lower > delete_upper) {
93 return false;
94 }
95 if (delete_lower > data_upper) {
96 return false;
97 }
98 return true;
99}
100
102inline bool AllNull(const std::map<int32_t, int64_t>& null_counts,
103 const std::map<int32_t, int64_t>& value_counts, int32_t field_id,
104 bool is_required) {
105 if (is_required) {
106 return false;
107 }
108
109 auto null_it = null_counts.find(field_id);
110 auto value_it = value_counts.find(field_id);
111 if (null_it == null_counts.cend() || value_it == value_counts.cend()) {
112 return false;
113 }
114
115 return null_it->second == value_it->second;
116}
117
119inline bool AllNonNull(const std::map<int32_t, int64_t>& null_counts, int32_t field_id,
120 bool is_required) {
121 if (is_required) {
122 return true;
123 }
124
125 auto it = null_counts.find(field_id);
126 if (it == null_counts.cend()) {
127 return false;
128 }
129
130 return it->second <= 0;
131}
132
134inline bool ContainsNull(const std::map<int32_t, int64_t>& null_counts, int32_t field_id,
135 bool is_required) {
136 if (is_required) {
137 return false;
138 }
139
140 auto it = null_counts.find(field_id);
141 if (it == null_counts.cend()) {
142 return true; // Unknown, assume may contain null
143 }
144
145 return it->second > 0;
146}
147
149ICEBERG_EXPORT Result<bool> CanContainEqDeletesForFile(
150 const DataFile& data_file, const EqualityDeleteFile& delete_file);
151
156class ICEBERG_EXPORT PositionDeletes {
157 public:
158 PositionDeletes() = default;
159
161 [[nodiscard]] Status Add(ManifestEntry&& entry);
162
165 std::vector<std::shared_ptr<DataFile>> Filter(int64_t seq);
166
168 std::vector<std::shared_ptr<DataFile>> ReferencedDeleteFiles();
169
171 bool empty() const { return files_.empty(); }
172
173 private:
174 void IndexIfNeeded();
175
176 std::vector<ManifestEntry> files_;
177 std::vector<int64_t> seqs_;
178 bool indexed_ = false;
179};
180
185class ICEBERG_EXPORT EqualityDeletes {
186 public:
187 explicit EqualityDeletes(const Schema& schema) : schema_(schema) {}
188
190 [[nodiscard]] Status Add(ManifestEntry&& entry);
191
197 Result<std::vector<std::shared_ptr<DataFile>>> Filter(int64_t seq,
198 const DataFile& data_file);
199
201 std::vector<std::shared_ptr<DataFile>> ReferencedDeleteFiles();
202
204 bool empty() const { return files_.empty(); }
205
206 private:
207 void IndexIfNeeded();
208
209 const Schema& schema_;
210 std::vector<EqualityDeleteFile> files_;
211 std::vector<int64_t> seqs_;
212 bool indexed_ = false;
213};
214
215} // namespace internal
216
228class ICEBERG_EXPORT DeleteFileIndex {
229 public:
230 class Builder;
231
233
235 DeleteFileIndex& operator=(DeleteFileIndex&&) noexcept;
236 DeleteFileIndex(const DeleteFileIndex&) = delete;
237 DeleteFileIndex& operator=(const DeleteFileIndex&) = delete;
238
240 bool empty() const;
241
243 bool has_equality_deletes() const;
244
246 bool has_position_deletes() const;
247
250 std::vector<std::shared_ptr<DataFile>> ReferencedDeleteFiles() const;
251
256 Result<std::vector<std::shared_ptr<DataFile>>> ForEntry(
257 const ManifestEntry& entry) const;
258
265 Result<std::vector<std::shared_ptr<DataFile>>> ForDataFile(int64_t sequence_number,
266 const DataFile& file) const;
267
275 static Result<Builder> BuilderFor(
276 std::shared_ptr<FileIO> io, std::shared_ptr<Schema> schema,
277 std::unordered_map<int32_t, std::shared_ptr<PartitionSpec>> specs_by_id,
278 std::vector<ManifestFile> delete_manifests);
279
280 private:
281 friend class Builder;
282
283 // Private constructor used by Builder
285 std::unique_ptr<internal::EqualityDeletes> global_deletes,
286 std::unique_ptr<PartitionMap<std::unique_ptr<internal::EqualityDeletes>>>
287 eq_deletes_by_partition,
288 std::unique_ptr<PartitionMap<std::unique_ptr<internal::PositionDeletes>>>
289 pos_deletes_by_partition,
290 std::unique_ptr<
291 std::unordered_map<std::string, std::unique_ptr<internal::PositionDeletes>>>
292 pos_deletes_by_path,
293 std::unique_ptr<std::unordered_map<std::string, ManifestEntry>> dv_by_path);
294
295 // Helper methods for finding delete files
296 Result<std::vector<std::shared_ptr<DataFile>>> FindGlobalDeletes(
297 int64_t seq, const DataFile& data_file) const;
298 Result<std::vector<std::shared_ptr<DataFile>>> FindEqPartitionDeletes(
299 int64_t seq, const DataFile& data_file) const;
300 Result<std::vector<std::shared_ptr<DataFile>>> FindPosPartitionDeletes(
301 int64_t seq, const DataFile& data_file) const;
302 Result<std::vector<std::shared_ptr<DataFile>>> FindPathDeletes(
303 int64_t seq, const DataFile& data_file) const;
304 Result<std::shared_ptr<DataFile>> FindDV(int64_t seq, const DataFile& data_file) const;
305
306 // Index data structures
307 std::unique_ptr<internal::EqualityDeletes> global_deletes_;
308 std::unique_ptr<PartitionMap<std::unique_ptr<internal::EqualityDeletes>>>
309 eq_deletes_by_partition_;
310 std::unique_ptr<PartitionMap<std::unique_ptr<internal::PositionDeletes>>>
311 pos_deletes_by_partition_;
312 std::unique_ptr<
313 std::unordered_map<std::string, std::unique_ptr<internal::PositionDeletes>>>
314 pos_deletes_by_path_;
315 std::unique_ptr<std::unordered_map<std::string, ManifestEntry>> dv_by_path_;
316
317 bool has_eq_deletes_ = false;
318 bool has_pos_deletes_ = false;
319 bool is_empty_ = true;
320};
321
322class ICEBERG_EXPORT DeleteFileIndex::Builder : public ErrorCollector {
323 public:
325 Builder(std::shared_ptr<FileIO> io, std::shared_ptr<Schema> schema,
326 std::unordered_map<int32_t, std::shared_ptr<PartitionSpec>> specs_by_id,
327 std::vector<ManifestFile> delete_manifests);
328
329 ~Builder() override;
330
331 Builder(Builder&&) noexcept;
332 Builder& operator=(Builder&&) noexcept;
333 Builder(const Builder&) = delete;
334 Builder& operator=(const Builder&) = delete;
335
339 Builder& AfterSequenceNumber(int64_t seq);
340
345 Builder& DataFilter(std::shared_ptr<Expression> filter);
346
348 Builder& PartitionFilter(std::shared_ptr<Expression> filter);
349
351 Builder& FilterPartitions(std::shared_ptr<PartitionSet> partition_set);
352
354 Builder& CaseSensitive(bool case_sensitive);
355
357 Builder& IgnoreResiduals();
358
360 Result<std::unique_ptr<DeleteFileIndex>> Build();
361
362 private:
363 // Load delete files from manifests
364 Result<std::vector<ManifestEntry>> LoadDeleteFiles();
365
366 // Add a DV to the index
367 Status AddDV(std::unordered_map<std::string, ManifestEntry>& dv_by_path,
368 ManifestEntry&& entry);
369
370 // Add a position delete file to the index
371 Status AddPositionDelete(
372 std::unordered_map<std::string, std::unique_ptr<internal::PositionDeletes>>&
373 deletes_by_path,
374 PartitionMap<std::unique_ptr<internal::PositionDeletes>>& deletes_by_partition,
375 ManifestEntry&& entry);
376
377 // Add an equality delete file to the index
378 Status AddEqualityDelete(
379 internal::EqualityDeletes& global_deletes,
380 PartitionMap<std::unique_ptr<internal::EqualityDeletes>>& deletes_by_partition,
381 ManifestEntry&& entry);
382
383 std::shared_ptr<FileIO> io_;
384 std::shared_ptr<Schema> schema_;
385 std::unordered_map<int32_t, std::shared_ptr<PartitionSpec>> specs_by_id_;
386 std::vector<ManifestFile> delete_manifests_;
387 int64_t min_sequence_number_ = 0;
388 std::shared_ptr<Expression> data_filter_;
389 std::shared_ptr<Expression> partition_filter_;
390 std::shared_ptr<PartitionSet> partition_set_;
391 bool case_sensitive_ = true;
392 bool ignore_residuals_ = false;
393};
394
395} // namespace iceberg
An index of delete files by sequence number.
Definition delete_file_index.h:228
Builder(std::shared_ptr< FileIO > io, std::shared_ptr< Schema > schema, std::unordered_map< int32_t, std::shared_ptr< PartitionSpec > > specs_by_id, std::vector< ManifestFile > delete_manifests)
Construct a builder from manifest files.
Base class for collecting errors in the builder pattern.
Definition error_collector.h:93
Represents a boolean expression tree.
Definition expression.h:37
Pluggable module for reading, writing, and deleting files.
Definition file_io.h:115
Literal is a literal value that is associated with a primitive type.
Definition literal.h:39
A map that uses a pair of spec ID and partition tuple as keys.
Definition partition_value_util.h:115
A set that uses a pair of spec ID and partition tuple as elements.
Definition partition_value_util.h:204
A partition spec for a Table.
Definition partition_spec.h:47
A schema for a Table.
Definition schema.h:49
A group of equality delete files sorted by apply sequence number.
Definition delete_file_index.h:185
bool empty() const
Check if this group is empty.
Definition delete_file_index.h:204
A group of position delete files sorted by the sequence number they apply to.
Definition delete_file_index.h:156
bool empty() const
Check if this group is empty.
Definition delete_file_index.h:171
bool AllNull(const std::map< int32_t, int64_t > &null_counts, const std::map< int32_t, int64_t > &value_counts, int32_t field_id, bool is_required)
Check if a value count map indicates all values are null.
Definition delete_file_index.h:102
bool RangesOverlap(const Literal &data_lower, const Literal &data_upper, const Literal &delete_lower, const Literal &delete_upper)
Check if two ranges overlap.
Definition delete_file_index.h:90
bool ContainsNull(const std::map< int32_t, int64_t > &null_counts, int32_t field_id, bool is_required)
Check if the column contains any null values.
Definition delete_file_index.h:134
bool AllNonNull(const std::map< int32_t, int64_t > &null_counts, int32_t field_id, bool is_required)
Check if all values are non-null.
Definition delete_file_index.h:119
STL namespace.
DataFile carries data file path, partition tuple, metrics, ...
Definition manifest_entry.h:62
A manifest is an immutable Avro file that lists data files or delete files, along with each file's pa...
Definition manifest_entry.h:307
std::optional< int64_t > sequence_number
Definition manifest_entry.h:328
std::shared_ptr< DataFile > data_file
Definition manifest_entry.h:344
Entry in a manifest list.
Definition manifest_list.h:85
Wrapper for equality delete files that caches converted bounds.
Definition delete_file_index.h:45
Result< std::optional< std::reference_wrapper< const Literal > > > UpperBound(int32_t id) const
Get the upper bound for a field ID.
Definition delete_file_index.h:76
Result< std::optional< std::reference_wrapper< const Literal > > > LowerBound(int32_t id) const
Get the lower bound for a field ID.
Definition delete_file_index.h:67
bool HasLowerAndUpperBounds() const
Check if this delete file has both lower and upper bounds.
Definition delete_file_index.h:61