iceberg-cpp
Loading...
Searching...
No Matches
snapshot.h
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20#pragma once
21
#include <cstddef>
#include <cstdint>
#include <memory>
#include <optional>
#include <span>
#include <string>
#include <string_view>
#include <unordered_map>
#include <utility>
#include <variant>
#include <vector>

#include "iceberg/iceberg_export.h"
// NOTE(review): SnapshotCache uses ManifestFile by value inside std::vector; confirm the
// header that defines ManifestFile (manifest_list.h) is included here — TODO verify path.
#include "iceberg/result.h"
#include "iceberg/type_fwd.h"
#include "iceberg/util/lazy.h"
#include "iceberg/util/timepoint.h"
36
37namespace iceberg {
38
/// \brief The type of a snapshot reference.
enum class SnapshotRefType {
  /// The reference is a branch (string form: "branch").
  kBranch,
  /// The reference is a tag (string form: "tag").
  kTag,
};
48
50ICEBERG_EXPORT constexpr std::string_view ToString(SnapshotRefType type) noexcept {
51 switch (type) {
52 case SnapshotRefType::kBranch:
53 return "branch";
54 case SnapshotRefType::kTag:
55 return "tag";
56 }
57 std::unreachable();
58}
60ICEBERG_EXPORT constexpr Result<SnapshotRefType> SnapshotRefTypeFromString(
61 std::string_view str) noexcept {
62 if (str == "branch") return SnapshotRefType::kBranch;
63 if (str == "tag") return SnapshotRefType::kTag;
64 return InvalidArgument("Invalid snapshot reference type: {}", str);
65}
66
68struct ICEBERG_EXPORT SnapshotRef {
69 static constexpr std::string_view kMainBranch = "main";
70
71 struct ICEBERG_EXPORT Branch {
75 std::optional<int32_t> min_snapshots_to_keep;
79 std::optional<int64_t> max_snapshot_age_ms;
83 std::optional<int64_t> max_ref_age_ms;
84
86 friend bool operator==(const Branch& lhs, const Branch& rhs) {
87 return lhs.Equals(rhs);
88 }
89
90 private:
92 bool Equals(const Branch& other) const;
93 };
94
95 struct ICEBERG_EXPORT Tag {
99 std::optional<int64_t> max_ref_age_ms;
100
102 friend bool operator==(const Tag& lhs, const Tag& rhs) { return lhs.Equals(rhs); }
103
104 private:
106 bool Equals(const Tag& other) const;
107 };
108
110 int64_t snapshot_id;
112 std::variant<Branch, Tag> retention;
113
114 SnapshotRefType type() const noexcept;
115
116 std::optional<int64_t> max_ref_age_ms() const noexcept;
117
126 static Result<std::unique_ptr<SnapshotRef>> MakeBranch(
127 int64_t snapshot_id, std::optional<int32_t> min_snapshots_to_keep = std::nullopt,
128 std::optional<int64_t> max_snapshot_age_ms = std::nullopt,
129 std::optional<int64_t> max_ref_age_ms = std::nullopt);
130
137 static Result<std::unique_ptr<SnapshotRef>> MakeTag(
138 int64_t snapshot_id, std::optional<int64_t> max_ref_age_ms = std::nullopt);
139
145 std::unique_ptr<SnapshotRef> Clone(
146 std::optional<int64_t> new_snapshot_id = std::nullopt) const;
147
149 Status Validate() const;
150
152 friend bool operator==(const SnapshotRef& lhs, const SnapshotRef& rhs) {
153 return lhs.Equals(rhs);
154 }
155
156 private:
158 bool Equals(const SnapshotRef& other) const;
159};
160
162struct ICEBERG_EXPORT SnapshotSummaryFields {
164 inline static const std::string kOperation = "operation";
166 inline static const std::string kFirstRowId = "first-row-id";
168 inline static const std::string kAddedRows = "added-rows";
169
171
173 inline static const std::string kAddedDataFiles = "added-data-files";
175 inline static const std::string kDeletedDataFiles = "deleted-data-files";
177 inline static const std::string kTotalDataFiles = "total-data-files";
180 inline static const std::string kAddedDeleteFiles = "added-delete-files";
182 inline static const std::string kAddedEqDeleteFiles = "added-equality-delete-files";
184 inline static const std::string kRemovedEqDeleteFiles = "removed-equality-delete-files";
186 inline static const std::string kAddedPosDeleteFiles = "added-position-delete-files";
188 inline static const std::string kRemovedPosDeleteFiles =
189 "removed-position-delete-files";
191 inline static const std::string kAddedDVs = "added-dvs";
193 inline static const std::string kRemovedDVs = "removed-dvs";
196 inline static const std::string kRemovedDeleteFiles = "removed-delete-files";
199 inline static const std::string kTotalDeleteFiles = "total-delete-files";
201 inline static const std::string kAddedRecords = "added-records";
203 inline static const std::string kDeletedRecords = "deleted-records";
205 inline static const std::string kTotalRecords = "total-records";
207 inline static const std::string kAddedFileSize = "added-files-size";
209 inline static const std::string kRemovedFileSize = "removed-files-size";
211 inline static const std::string kTotalFileSize = "total-files-size";
213 inline static const std::string kAddedPosDeletes = "added-position-deletes";
215 inline static const std::string kRemovedPosDeletes = "removed-position-deletes";
217 inline static const std::string kTotalPosDeletes = "total-position-deletes";
219 inline static const std::string kAddedEqDeletes = "added-equality-deletes";
221 inline static const std::string kRemovedEqDeletes = "removed-equality-deletes";
223 inline static const std::string kTotalEqDeletes = "total-equality-deletes";
226 inline static const std::string kDeletedDuplicatedFiles = "deleted-duplicate-files";
228 inline static const std::string kChangedPartitionCountProp = "changed-partition-count";
230 inline static const std::string kManifestsCreated = "manifests-created";
232 inline static const std::string kManifestsKept = "manifests-kept";
234 inline static const std::string kManifestsReplaced = "manifests-replaced";
236 inline static const std::string kEntriesProcessed = "entries-processed";
238 inline static const std::string kChangedPartitionPrefix = "partitions.";
240 inline static const std::string kPartitionSummaryProp = "partition-summaries-included";
241
243
245 inline static const std::string kWAPId = "wap.id";
247 inline static const std::string kPublishedWAPId = "published-wap-id";
249 inline static const std::string kSourceSnapshotId = "source-snapshot-id";
251 inline static const std::string kEngineName = "engine-name";
253 inline static const std::string kEngineVersion = "engine-version";
254};
255
260class ICEBERG_EXPORT SnapshotSummaryBuilder {
261 private:
263 class UpdateMetrics {
264 public:
265 void Clear();
266 void AddTo(std::unordered_map<std::string, std::string>& builder) const;
267 void AddedFile(const DataFile& file);
268 void RemovedFile(const DataFile& file);
269 void AddedManifest(const ManifestFile& manifest);
270 void Merge(const UpdateMetrics& other);
271
272 private:
273 int64_t added_size_{0};
274 int64_t removed_size_{0};
275 int32_t added_files_{0};
276 int32_t removed_files_{0};
277 int32_t added_eq_delete_files_{0};
278 int32_t removed_eq_delete_files_{0};
279 int32_t added_pos_delete_files_{0};
280 int32_t removed_pos_delete_files_{0};
281 int32_t added_dvs_{0};
282 int32_t removed_dvs_{0};
283 int32_t added_delete_files_{0};
284 int32_t removed_delete_files_{0};
285 int64_t added_records_{0};
286 int64_t deleted_records_{0};
287 int64_t added_pos_deletes_{0};
288 int64_t removed_pos_deletes_{0};
289 int64_t added_eq_deletes_{0};
290 int64_t removed_eq_deletes_{0};
291 bool trust_size_and_delete_counts_{true};
292 };
293
294 public:
295 SnapshotSummaryBuilder() = default;
296
298 void Clear();
299
309 void SetPartitionSummaryLimit(int32_t max);
310
314 void IncrementDuplicateDeletes(int32_t increment = 1);
315
321 Status AddedFile(const PartitionSpec& spec, const DataFile& file);
322
328 Status DeletedFile(const PartitionSpec& spec, const DataFile& file);
329
333 void AddedManifest(const ManifestFile& manifest);
334
339 void Set(const std::string& property, const std::string& value);
340
344 void Merge(const SnapshotSummaryBuilder& other);
345
349 std::unordered_map<std::string, std::string> Build() const;
350
351 private:
352 Status UpdatePartitions(const PartitionSpec& spec, const DataFile& file,
353 bool is_addition);
354 std::string PartitionSummary(const UpdateMetrics& metrics) const;
355
356 std::unordered_map<std::string, std::string> properties_;
357 std::unordered_map<std::string, UpdateMetrics> partition_metrics_;
358 UpdateMetrics metrics_;
359 int32_t max_changed_partitions_for_summaries_{0};
360 int64_t deleted_duplicate_files_{0};
361 bool trust_partition_metrics_{true};
362};
363
369struct ICEBERG_EXPORT DataOperation {
371 inline static const std::string kAppend = "append";
374 inline static const std::string kReplace = "replace";
377 inline static const std::string kOverwrite = "overwrite";
380 inline static const std::string kDelete = "delete";
381};
382
389struct ICEBERG_EXPORT Snapshot {
391 int64_t snapshot_id;
393 std::optional<int64_t> parent_snapshot_id;
398 TimePointMs timestamp_ms;
401 std::string manifest_list;
403 std::unordered_map<std::string, std::string> summary;
405 std::optional<int32_t> schema_id;
406
408 static Result<std::unique_ptr<Snapshot>> Make(
409 int64_t sequence_number, int64_t snapshot_id,
410 std::optional<int64_t> parent_snapshot_id, TimePointMs timestamp_ms,
411 std::string operation, std::unordered_map<std::string, std::string> summary,
412 std::optional<int32_t> schema_id, std::string manifest_list,
413 std::optional<int64_t> first_row_id = std::nullopt,
414 std::optional<int64_t> added_rows = std::nullopt);
415
421 std::optional<std::string_view> Operation() const;
422
432 Result<std::optional<int64_t>> FirstRowId() const;
433
444 Result<std::optional<int64_t>> AddedRows() const;
445
447 friend bool operator==(const Snapshot& lhs, const Snapshot& rhs) {
448 return lhs.Equals(rhs);
449 }
450
451 private:
453 bool Equals(const Snapshot& other) const;
454};
455
459class ICEBERG_EXPORT SnapshotCache {
460 public:
461 explicit SnapshotCache(const Snapshot* snapshot) : snapshot_(snapshot) {}
462
464 const Snapshot& snapshot() const { return *snapshot_; }
465
471 Result<std::span<ManifestFile>> Manifests(std::shared_ptr<FileIO> file_io) const;
472
477 Result<std::span<ManifestFile>> DataManifests(std::shared_ptr<FileIO> file_io) const;
478
483 Result<std::span<ManifestFile>> DeleteManifests(std::shared_ptr<FileIO> file_io) const;
484
485 private:
490 using ManifestsCache = std::pair<std::vector<ManifestFile>, size_t>;
491
496 static Result<ManifestsCache> InitManifestsCache(const Snapshot* snapshot,
497 std::shared_ptr<FileIO> file_io);
498
500 const Snapshot* snapshot_;
501
503 Lazy<InitManifestsCache> manifests_cache_;
504};
505
506} // namespace iceberg
A partition spec for a Table.
Definition partition_spec.h:47
Maintains statistics for each partition field and produces the partition field summaries.
Definition partition_summary_internal.h:51
A snapshot with cached manifest loading capabilities.
Definition snapshot.h:459
const Snapshot & snapshot() const
Get the underlying Snapshot reference.
Definition snapshot.h:464
Helper class for building snapshot summaries.
Definition snapshot.h:260
STL namespace.
DataFile carries data file path, partition tuple, metrics, ...
Definition manifest_entry.h:62
Data operations that produce snapshots.
Definition snapshot.h:369
Entry in a manifest list.
Definition manifest_list.h:85
Definition snapshot.h:71
std::optional< int32_t > min_snapshots_to_keep
Definition snapshot.h:75
std::optional< int64_t > max_snapshot_age_ms
Definition snapshot.h:79
std::optional< int64_t > max_ref_age_ms
Definition snapshot.h:83
friend bool operator==(const Branch &lhs, const Branch &rhs)
Compare two branches for equality.
Definition snapshot.h:86
Definition snapshot.h:95
std::optional< int64_t > max_ref_age_ms
Definition snapshot.h:99
friend bool operator==(const Tag &lhs, const Tag &rhs)
Compare two tags for equality.
Definition snapshot.h:102
A reference to a snapshot, either a branch or a tag.
Definition snapshot.h:68
int64_t snapshot_id
A reference's snapshot ID. The tagged snapshot or latest snapshot of a branch.
Definition snapshot.h:110
std::variant< Branch, Tag > retention
Snapshot retention policy.
Definition snapshot.h:112
Optional Snapshot Summary Fields.
Definition snapshot.h:162
A snapshot of the data in a table at a point in time.
Definition snapshot.h:389
friend bool operator==(const Snapshot &lhs, const Snapshot &rhs)
Compare two snapshots for equality.
Definition snapshot.h:447
int64_t snapshot_id
A unique long ID.
Definition snapshot.h:391
int64_t sequence_number
A monotonically increasing long that tracks the order of changes to a table.
Definition snapshot.h:395
std::optional< int32_t > schema_id
ID of the table's current schema when the snapshot was created.
Definition snapshot.h:405
TimePointMs timestamp_ms
Definition snapshot.h:398
std::optional< int64_t > parent_snapshot_id
The snapshot ID of the snapshot's parent. Omitted for any snapshot with no parent.
Definition snapshot.h:393
std::string manifest_list
Definition snapshot.h:401
std::unordered_map< std::string, std::string > summary
A string map that summarizes the snapshot changes, including operation.
Definition snapshot.h:403