iceberg-cpp
Loading...
Searching...
No Matches
manifest_entry.h
Go to the documentation of this file.
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20#pragma once
21
23
24#include <cstdint>
25#include <map>
26#include <memory>
27#include <optional>
28#include <string>
29#include <vector>
30
31#include "iceberg/file_format.h"
32#include "iceberg/iceberg_export.h"
33#include "iceberg/result.h"
36#include "iceberg/type.h"
37
38namespace iceberg {
39
/// \brief Status of a file tracked by a manifest entry.
///
/// The numeric values are the integers accepted by ManifestStatusFromInt().
enum class ManifestStatus {
  /// The file was already present (value 0).
  kExisting = 0,
  /// The file was added (value 1).
  kAdded = 1,
  /// The file was deleted (value 2).
  kDeleted = 2,
};
45
47ICEBERG_EXPORT constexpr Result<ManifestStatus> ManifestStatusFromInt(
48 int32_t status) noexcept {
49 switch (status) {
50 case 0:
51 return ManifestStatus::kExisting;
52 case 1:
53 return ManifestStatus::kAdded;
54 case 2:
55 return ManifestStatus::kDeleted;
56 default:
57 return InvalidArgument("Invalid manifest status: {}", status);
58 }
59}
60
/// \brief Metadata describing a single data or delete file tracked by a manifest.
///
/// The data members hold the file location, format, record/size totals, per-column
/// metrics keyed by column id, and optional attributes. The static members give the
/// field id and SchemaField (name, type, doc) for each corresponding schema field.
///
/// NOTE(review): the cross-reference index of this listing reports a member
/// `PartitionValues partition` at manifest_entry.h:83, but no such line appears in
/// the listing below -- confirm against the real header before editing this struct.
62struct ICEBERG_EXPORT DataFile {
 /// Kind of content stored in the file; the values match the integers accepted by
 /// DataFileContentFromInt().
64 enum class Content {
65 kData = 0,
66 kPositionDeletes = 1,
67 kEqualityDeletes = 2,
68 };
69
 /// Contents of the file (data, position deletes or equality deletes); defaults to
 /// kData.
73 Content content = Content::kData;
 /// Location URI with FS scheme.
76 std::string file_path;
 /// File format; defaults to Parquet.
79 FileFormatType file_format = FileFormatType::kParquet;
 /// Number of records in the file.
86 int64_t record_count = 0;
 /// Total file size in bytes.
89 int64_t file_size_in_bytes = 0;
 /// Map of column id to total size on disk.
96 std::map<int32_t, int64_t> column_sizes;
 /// Map of column id to total count, including null and NaN.
101 std::map<int32_t, int64_t> value_counts;
 /// Map of column id to null value count.
106 std::map<int32_t, int64_t> null_value_counts;
 /// Map of column id to number of NaN values in the column.
111 std::map<int32_t, int64_t> nan_value_counts;
 /// Map of column id to lower bound, held as raw bytes.
118 std::map<int32_t, std::vector<uint8_t>> lower_bounds;
 /// Map of column id to upper bound, held as raw bytes.
125 std::map<int32_t, std::vector<uint8_t>> upper_bounds;
 /// Encryption key metadata blob.
128 std::vector<uint8_t> key_metadata;
 /// Splittable offsets.
133 std::vector<int64_t> split_offsets;
 /// Equality comparison field IDs.
139 std::vector<int32_t> equality_ids;
 /// Sort order ID, if set.
148 std::optional<int32_t> sort_order_id;
 /// Starting row ID to assign to new rows, if set.
155 std::optional<int64_t> first_row_id;
 /// Fully qualified location (URI with FS scheme) of a data file that all deletes
 /// reference, if set.
163 std::optional<std::string> referenced_data_file;
 /// The offset in the file where the content starts, if set.
171 std::optional<int64_t> content_offset;
 /// The length of referenced content stored in the file, if set.
175 std::optional<int64_t> content_size_in_bytes;
176
 /// Partition spec id for this data file. No schema field constant is declared for
 /// it in this struct.
180 std::optional<int32_t> partition_spec_id;
181
 // ---- Schema field ids and descriptors --------------------------------------
 // Each k*FieldId is the field id used by the SchemaField declared right after it.
 // Key/value fields of map types and element fields of list types use the literal
 // ids written inline.
182 static constexpr int32_t kContentFieldId = 134;
183 inline static const SchemaField kContent = SchemaField::MakeOptional(
184 kContentFieldId, "content", int32(),
185 "Contents of the file: 0=data, 1=position deletes, 2=equality deletes");
186
187 static constexpr int32_t kFilePathFieldId = 100;
188 inline static const SchemaField kFilePath = SchemaField::MakeRequired(
189 kFilePathFieldId, "file_path", string(), "Location URI with FS scheme");
190
191 static constexpr int32_t kFileFormatFieldId = 101;
192 inline static const SchemaField kFileFormat =
193 SchemaField::MakeRequired(kFileFormatFieldId, "file_format", string(),
194 "File format name: avro, orc, or parquet");
195
 // The partition field is described only by id, name and doc here (no SchemaField);
 // presumably its type is supplied through Type(partition_type) -- confirm in the
 // implementation.
196 static constexpr int32_t kPartitionFieldId = 102;
197 inline static const std::string kPartitionField = "partition";
198 inline static const std::string kPartitionDoc =
199 "Partition data tuple, schema based on the partition spec";
200
201 static constexpr int32_t kRecordCountFieldId = 103;
202 inline static const SchemaField kRecordCount = SchemaField::MakeRequired(
203 kRecordCountFieldId, "record_count", int64(), "Number of records in the file");
204
205 static constexpr int32_t kFileSizeFieldId = 104;
206 inline static const SchemaField kFileSize = SchemaField::MakeRequired(
207 kFileSizeFieldId, "file_size_in_bytes", int64(), "Total file size in bytes");
208
 // Per-column metric maps: int32 key (ids 117..139 as written below) to int64 or
 // binary value.
209 static constexpr int32_t kColumnSizesFieldId = 108;
210 inline static const SchemaField kColumnSizes = SchemaField::MakeOptional(
211 kColumnSizesFieldId, "column_sizes",
212 map(SchemaField::MakeRequired(117, std::string(MapType::kKeyName), int32()),
213 SchemaField::MakeRequired(118, std::string(MapType::kValueName), int64())),
214 "Map of column id to total size on disk");
215
216 static constexpr int32_t kValueCountsFieldId = 109;
217 inline static const SchemaField kValueCounts = SchemaField::MakeOptional(
218 kValueCountsFieldId, "value_counts",
219 map(SchemaField::MakeRequired(119, std::string(MapType::kKeyName), int32()),
220 SchemaField::MakeRequired(120, std::string(MapType::kValueName), int64())),
221 "Map of column id to total count, including null and NaN");
222
223 static constexpr int32_t kNullValueCountsFieldId = 110;
224 inline static const SchemaField kNullValueCounts = SchemaField::MakeOptional(
225 kNullValueCountsFieldId, "null_value_counts",
226 map(SchemaField::MakeRequired(121, std::string(MapType::kKeyName), int32()),
227 SchemaField::MakeRequired(122, std::string(MapType::kValueName), int64())),
228 "Map of column id to null value count");
229
230 static constexpr int32_t kNanValueCountsFieldId = 137;
231 inline static const SchemaField kNanValueCounts = SchemaField::MakeOptional(
232 kNanValueCountsFieldId, "nan_value_counts",
233 map(SchemaField::MakeRequired(138, std::string(MapType::kKeyName), int32()),
234 SchemaField::MakeRequired(139, std::string(MapType::kValueName), int64())),
235 "Map of column id to number of NaN values in the column");
236
237 static constexpr int32_t kLowerBoundsFieldId = 125;
238 inline static const SchemaField kLowerBounds = SchemaField::MakeOptional(
239 kLowerBoundsFieldId, "lower_bounds",
240 map(SchemaField::MakeRequired(126, std::string(MapType::kKeyName), int32()),
241 SchemaField::MakeRequired(127, std::string(MapType::kValueName), binary())),
242 "Map of column id to lower bound");
243
244 static constexpr int32_t kUpperBoundsFieldId = 128;
245 inline static const SchemaField kUpperBounds = SchemaField::MakeOptional(
246 kUpperBoundsFieldId, "upper_bounds",
247 map(SchemaField::MakeRequired(129, std::string(MapType::kKeyName), int32()),
248 SchemaField::MakeRequired(130, std::string(MapType::kValueName), binary())),
249 "Map of column id to upper bound");
250
251 static constexpr int32_t kKeyMetadataFieldId = 131;
252 inline static const SchemaField kKeyMetadata = SchemaField::MakeOptional(
253 kKeyMetadataFieldId, "key_metadata", binary(), "Encryption key metadata blob");
254
255 static constexpr int32_t kSplitOffsetsFieldId = 132;
256 inline static const SchemaField kSplitOffsets = SchemaField::MakeOptional(
257 kSplitOffsetsFieldId, "split_offsets",
258 list(SchemaField::MakeRequired(133, std::string(ListType::kElementName), int64())),
259 "Splittable offsets");
260
261 static constexpr int32_t kEqualityIdsFieldId = 135;
262 inline static const SchemaField kEqualityIds = SchemaField::MakeOptional(
263 kEqualityIdsFieldId, "equality_ids",
264 list(SchemaField::MakeRequired(136, std::string(ListType::kElementName), int32())),
265 "Equality comparison field IDs");
266
267 static constexpr int32_t kSortOrderIdFieldId = 140;
268 inline static const SchemaField kSortOrderId = SchemaField::MakeOptional(
269 kSortOrderIdFieldId, "sort_order_id", int32(), "Sort order ID");
270
271 static constexpr int32_t kFirstRowIdFieldId = 142;
272 inline static const SchemaField kFirstRowId =
273 SchemaField::MakeOptional(kFirstRowIdFieldId, "first_row_id", int64(),
274 "Starting row ID to assign to new rows");
275
276 static constexpr int32_t kReferencedDataFileFieldId = 143;
277 inline static const SchemaField kReferencedDataFile = SchemaField::MakeOptional(
278 kReferencedDataFileFieldId, "referenced_data_file", string(),
279 "Fully qualified location (URI with FS scheme) of a data file that all deletes "
280 "reference");
281
282 static constexpr int32_t kContentOffsetFieldId = 144;
283 inline static const SchemaField kContentOffset =
284 SchemaField::MakeOptional(kContentOffsetFieldId, "content_offset", int64(),
285 "The offset in the file where the content starts");
286
287 static constexpr int32_t kContentSizeFieldId = 145;
288 inline static const SchemaField kContentSize =
289 SchemaField::MakeOptional(kContentSizeFieldId, "content_size_in_bytes", int64(),
290 "The length of referenced content stored in the file");
291
 /// Defaulted comparison: two DataFile values are equal when all of their data
 /// members compare equal.
292 bool operator==(const DataFile& other) const = default;
293
 /// Returns a struct type built from `partition_type`; defined out of line.
 /// NOTE(review): presumably the data_file struct schema with `partition_type` as the
 /// type of the "partition" field -- confirm in the implementation.
295 static std::shared_ptr<StructType> Type(std::shared_ptr<StructType> partition_type);
296
 /// Check if this data file is a deletion vector: true when the content is position
 /// deletes and the file format is Puffin.
298 bool IsDeletionVector() const {
299 return content == Content::kPositionDeletes && file_format == FileFormatType::kPuffin;
300 }
301};
302
305
307struct ICEBERG_EXPORT ManifestEntry {
311 ManifestStatus status = ManifestStatus::kAdded;
315 std::optional<int64_t> snapshot_id;
328 std::optional<int64_t> sequence_number;
341 std::optional<int64_t> file_sequence_number;
344 std::shared_ptr<DataFile> data_file;
345
346 static constexpr int32_t kStatusFieldId = 0;
347 inline static const SchemaField kStatus =
348 SchemaField::MakeRequired(kStatusFieldId, "status", int32());
349
350 static constexpr int32_t kSnapshotIdFieldId = 1;
351 inline static const SchemaField kSnapshotId =
352 SchemaField::MakeOptional(kSnapshotIdFieldId, "snapshot_id", int64());
353
354 static constexpr int32_t kDataFileFieldId = 2;
355 inline static const std::string kDataFileField = "data_file";
356
357 static constexpr int32_t kSequenceNumberFieldId = 3;
358 inline static const SchemaField kSequenceNumber =
359 SchemaField::MakeOptional(kSequenceNumberFieldId, "sequence_number", int64());
360
361 static constexpr int32_t kFileSequenceNumberFieldId = 4;
362 inline static const SchemaField kFileSequenceNumber = SchemaField::MakeOptional(
363 kFileSequenceNumberFieldId, "file_sequence_number", int64());
364
366 constexpr bool IsAlive() const {
367 return status == ManifestStatus::kAdded || status == ManifestStatus::kExisting;
368 }
369
370 ManifestEntry AsAdded() const {
371 ManifestEntry copy = *this;
372 copy.status = ManifestStatus::kAdded;
373 if (copy.data_file->first_row_id.has_value()) {
374 copy.data_file = std::make_unique<DataFile>(*copy.data_file);
375 copy.data_file->first_row_id = std::nullopt;
376 }
377 return copy;
378 }
379
380 ManifestEntry AsExisting() const {
381 ManifestEntry copy = *this;
382 copy.status = ManifestStatus::kExisting;
383 return copy;
384 }
385
386 ManifestEntry AsDeleted() const {
387 ManifestEntry copy = *this;
388 copy.status = ManifestStatus::kDeleted;
389 return copy;
390 }
391
392 bool operator==(const ManifestEntry& other) const;
393
394 static std::shared_ptr<StructType> TypeFromPartitionType(
395 std::shared_ptr<StructType> partition_type);
396 static std::shared_ptr<StructType> TypeFromDataFileType(
397 std::shared_ptr<StructType> datafile_type);
398};
399
401ICEBERG_EXPORT constexpr std::string_view ToString(DataFile::Content type) noexcept {
402 switch (type) {
403 case DataFile::Content::kData:
404 return "data";
405 case DataFile::Content::kPositionDeletes:
406 return "position_deletes";
407 case DataFile::Content::kEqualityDeletes:
408 return "equality_deletes";
409 }
410 std::unreachable();
411}
412
414ICEBERG_EXPORT constexpr Result<DataFile::Content> DataFileContentFromInt(
415 int32_t content) noexcept {
416 switch (content) {
417 case 0:
418 return DataFile::Content::kData;
419 case 1:
420 return DataFile::Content::kPositionDeletes;
421 case 2:
422 return DataFile::Content::kEqualityDeletes;
423 default:
424 return InvalidArgument("Invalid data file content: {}", content);
425 }
426}
427
428} // namespace iceberg
StructLike wrapper for a vector of literals that represent partition values.
Definition partition_values.h:36
A type combined with a name.
Definition schema_field.h:39
Interface for a data type for a field.
Definition type.h:44
ICEBERG_EXPORT const std::shared_ptr< BinaryType > & binary()
Return a BinaryType instance.
std::shared_ptr< MapType > map(SchemaField key, SchemaField value)
Create a MapType with the given key and value fields.
Definition type.cc:388
std::shared_ptr< ListType > list(SchemaField element)
Create a ListType with the given element field.
Definition type.cc:392
ICEBERG_EXPORT const std::shared_ptr< IntType > & int32()
Return an IntType instance.
ICEBERG_EXPORT const std::shared_ptr< LongType > & int64()
Return a LongType instance.
ICEBERG_EXPORT constexpr Result< ManifestStatus > ManifestStatusFromInt(int32_t status) noexcept
Get the manifest status that corresponds to an integer value.
Definition manifest_entry.h:47
ICEBERG_EXPORT constexpr Result< DataFile::Content > DataFileContentFromInt(int32_t content) noexcept
Get the data file content type that corresponds to an integer value.
Definition manifest_entry.h:414
@ kData
The manifest content is data.
DataFile carries data file path, partition tuple, metrics, ...
Definition manifest_entry.h:62
std::optional< int64_t > content_offset
Definition manifest_entry.h:171
std::map< int32_t, std::vector< uint8_t > > lower_bounds
Definition manifest_entry.h:118
std::optional< int64_t > content_size_in_bytes
Definition manifest_entry.h:175
std::optional< int64_t > first_row_id
Definition manifest_entry.h:155
PartitionValues partition
Definition manifest_entry.h:83
std::vector< int32_t > equality_ids
Definition manifest_entry.h:139
Content
Content of a data file.
Definition manifest_entry.h:64
std::map< int32_t, int64_t > column_sizes
Definition manifest_entry.h:96
std::map< int32_t, int64_t > null_value_counts
Definition manifest_entry.h:106
std::optional< int32_t > partition_spec_id
Partition spec id for this data file.
Definition manifest_entry.h:180
std::optional< std::string > referenced_data_file
Definition manifest_entry.h:163
std::vector< uint8_t > key_metadata
Definition manifest_entry.h:128
std::map< int32_t, std::vector< uint8_t > > upper_bounds
Definition manifest_entry.h:125
std::map< int32_t, int64_t > nan_value_counts
Definition manifest_entry.h:111
std::vector< int64_t > split_offsets
Definition manifest_entry.h:133
std::optional< int32_t > sort_order_id
Definition manifest_entry.h:148
bool IsDeletionVector() const
Check if this data file is a deletion vector.
Definition manifest_entry.h:298
std::string file_path
Definition manifest_entry.h:76
std::map< int32_t, int64_t > value_counts
Definition manifest_entry.h:101
A manifest is an immutable Avro file that lists data files or delete files, along with each file's pa...
Definition manifest_entry.h:307
std::optional< int64_t > snapshot_id
Definition manifest_entry.h:315
std::optional< int64_t > file_sequence_number
Definition manifest_entry.h:341
std::optional< int64_t > sequence_number
Definition manifest_entry.h:328
constexpr bool IsAlive() const
Check if this manifest entry is alive (status is added or existing), i.e. not deleted.
Definition manifest_entry.h:366
std::shared_ptr< DataFile > data_file
Definition manifest_entry.h:344
ManifestStatus status
Definition manifest_entry.h:311