iceberg-cpp
Loading...
Searching...
No Matches
schema.h
Go to the documentation of this file.
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20#pragma once
21
25
26#include <cstdint>
27#include <optional>
28#include <string>
29#include <unordered_map>
30#include <unordered_set>
31#include <vector>
32
33#include "iceberg/iceberg_export.h"
34#include "iceberg/result.h"
36#include "iceberg/type.h"
37#include "iceberg/util/lazy.h"
38#include "iceberg/util/string_util.h"
39
40namespace iceberg {
41
42class SchemaCache;
43
// A schema for a Table.
//
// Publicly derives from StructType, and adds the schema-level state declared
// at the bottom of the class: schema_id_, identifier_field_ids_ and cache_.
// Only declarations are visible in this header (apart from the inline
// operator==), so notes below that go beyond what a signature shows are
// marked as assumptions to confirm against the implementation file.
49class ICEBERG_EXPORT Schema : public StructType {
50 public:
// Named ID constants. kInitialSchemaId doubles as the default schema_id of the
// constructor below. NOTE(review): kInvalidColumnId (-1) looks like a
// "no such column" sentinel -- confirm at the use sites.
51 static constexpr int32_t kInitialSchemaId = 0;
52 static constexpr int32_t kInitialColumnId = 0;
53 static constexpr int32_t kInvalidColumnId = -1;
54
// The literal "*". NOTE(review): presumably the wildcard accepted by Select()
// to mean "every column" -- confirm.
56 static constexpr std::string_view kAllColumns = "*";
57
// Constructs a schema from `fields` (taken by value) and `schema_id`, which
// defaults to kInitialSchemaId. explicit, so a field vector never converts to
// a Schema implicitly. This constructor takes no identifier fields.
58 explicit Schema(std::vector<SchemaField> fields, int32_t schema_id = kInitialSchemaId);
59
// Factory overload taking the identifier fields by field ID. Returns a
// uniquely-owned Schema wrapped in Result; presumably a failed validation is
// reported through the Result rather than by throwing -- confirm.
66 static Result<std::unique_ptr<Schema>> Make(std::vector<SchemaField> fields,
67 int32_t schema_id,
68 std::vector<int32_t> identifier_field_ids);
69
// Factory overload taking the identifier fields by name instead of by ID.
// NOTE(review): presumably resolves each name to a field ID and then behaves
// like the overload above -- confirm.
77 static Result<std::unique_ptr<Schema>> Make(
78 std::vector<SchemaField> fields, int32_t schema_id,
79 const std::vector<std::string>& identifier_field_names);
80
// Checks a single candidate identifier field (`field_id`) against `schema`.
// `id_to_parent` maps int32_t to int32_t; judging by its name it maps a field
// ID to the ID of its enclosing field -- TODO confirm. The outcome is returned
// as a Status.
96 static Status ValidateIdentifierFields(
97 int32_t field_id, const Schema& schema,
98 const std::unordered_map<int32_t, int32_t>& id_to_parent);
99
// Returns a const reference to a shared_ptr<Schema>. NOTE(review): returning
// by reference implies the pointee outlives the call (likely a process-wide
// empty-schema singleton) -- confirm.
103 static const std::shared_ptr<Schema>& EmptySchema();
104
// Accessor returning an int32_t; presumably the value of schema_id_.
109 int32_t schema_id() const;
110
// Overrides the base-class ToString(); returns a std::string.
111 std::string ToString() const override;
112
// Looks up a field by name; `case_sensitive` defaults to true.
// The return type has three layers: Result (lookup machinery may fail),
// optional (presumably empty when no field matches -- confirm) and
// reference_wrapper<const SchemaField> (a non-owning reference, so the caller
// must not outlive whatever owns the field -- presumably this Schema).
122 Result<std::optional<std::reference_wrapper<const SchemaField>>> FindFieldByName(
123 std::string_view name, bool case_sensitive = true) const;
124
// Same return shape as FindFieldByName, keyed by field ID.
129 Result<std::optional<std::reference_wrapper<const SchemaField>>> FindFieldById(
130 int32_t field_id) const;
131
// Maps a field ID to a column name. The name is returned as a string_view,
// i.e. a non-owning view; NOTE(review): presumably into storage owned by this
// Schema's cache, so it must not be kept past the Schema's lifetime -- confirm.
137 Result<std::optional<std::string_view>> FindColumnNameById(int32_t field_id) const;
138
// Returns a uniquely-owned StructLikeAccessor for the given field ID.
// StructLikeAccessor is declared outside this header.
143 Result<std::unique_ptr<StructLikeAccessor>> GetAccessorById(int32_t field_id) const;
144
// Builds a new, uniquely-owned Schema from a list of names; `case_sensitive`
// defaults to true. NOTE(review): presumably keeps only the named columns --
// confirm, including how kAllColumns is treated.
152 Result<std::unique_ptr<Schema>> Select(std::span<const std::string> names,
153 bool case_sensitive = true) const;
154
// Builds a new, uniquely-owned Schema from a set of field IDs.
// NOTE(review): presumably the ID-based counterpart of Select() -- confirm how
// nested fields and their parents are handled.
161 Result<std::unique_ptr<Schema>> Project(
162 const std::unordered_set<int32_t>& field_ids) const;
163
// Returns a const reference to a vector of field IDs; presumably
// identifier_field_ids_ (the reference is then valid while this Schema lives).
165 const std::vector<int32_t>& IdentifierFieldIds() const;
166
// Name-based counterpart of IdentifierFieldIds(). Returns by value and wrapped
// in Result, unlike the ID accessor above.
168 Result<std::vector<std::string>> IdentifierFieldNames() const;
169
// Returns an int32_t field ID wrapped in Result. NOTE(review): presumably the
// maximum field ID across all (nested) fields -- confirm.
172 Result<int32_t> HighestFieldId() const;
173
// Boolean comparison against another Schema, declared separately from
// operator== / Equals below. NOTE(review): the exact difference between the
// two comparisons is not visible here -- confirm.
176 bool SameSchema(const Schema& other) const;
177
// Validates this schema for the given table format version; the outcome is
// returned as a Status.
185 Status Validate(int32_t format_version) const;
186
// Defined inline: forwards to lhs.Equals(rhs). With a Schema argument this is
// expected to pick the private Equals(const Schema&) overload declared below.
187 friend bool operator==(const Schema& lhs, const Schema& rhs) { return lhs.Equals(rhs); }
188
189 private:
// Re-exposes the base-class Equals overload(s), which the Schema-specific
// overload declared on the next line would otherwise hide.
190 using StructType::Equals;
192 bool Equals(const Schema& other) const;
193
// const data member: fixed at construction. It also means the implicitly
// generated copy/move assignment operators are not available for Schema.
194 const int32_t schema_id_;
195 // Field IDs that uniquely identify rows in the table.
196 std::vector<int32_t> identifier_field_ids_;
197 // Cache for schema mappings to facilitate fast lookups.
// SchemaCache is only forward-declared ahead of this class, so it is held
// through a pointer. NOTE(review): SchemaCache stores a raw `const Schema*`;
// if a Schema can ever be moved, verify the cache is not left pointing at the
// moved-from object.
198 std::unique_ptr<SchemaCache> cache_;
199};
200
201// Cache for schema mappings to facilitate fast lookups.
// Holds a pointer to a Schema plus five Lazy<> members, one per derived
// lookup structure (id -> field, name <-> id, lowercase name -> id,
// id -> position path, highest field id). Each Lazy<> is parameterized on the
// matching private static Init* function, and each public Get* accessor
// returns the corresponding structure wrapped in Result.
// NOTE(review): Lazy<> comes from iceberg/util/lazy.h and is not visible here;
// presumably each Init* runs on first access and its result is memoized --
// confirm, including any thread-safety guarantee.
202class ICEBERG_EXPORT SchemaCache {
203 public:
// Stores `schema` as-is in schema_. No destructor is declared in this class,
// so the pointer is presumably non-owning and the Schema must outlive the
// cache -- confirm.
204 explicit SchemaCache(const Schema* schema) : schema_(schema) {}
205
// Field ID -> reference to the SchemaField. The values are reference_wrappers,
// i.e. non-owning references to fields stored elsewhere.
206 using IdToFieldMap =
207 std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>;
208 using IdToFieldMapRef = std::reference_wrapper<const IdToFieldMap>;
209
// Pair of maps relating field names and field IDs in both directions.
210 struct NameIdMap {
// Mapping from canonical field name to ID.
// Uses StringHash with std::equal_to<> (transparent equality).
// NOTE(review): presumably StringHash is a transparent hash so lookups by
// std::string_view avoid building a std::string -- confirm in string_util.h.
216 std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>> name_to_id;
217
// Mapping from field ID to canonical name.
222 std::unordered_map<int32_t, std::string> id_to_name;
223 };
224 using NameIdMapRef = std::reference_wrapper<const NameIdMap>;
225
// Same container type as NameIdMap::name_to_id; per the member comment below
// it is keyed by the lowercased field name.
226 using LowercaseNameToIdMap =
227 std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>;
228 using LowercaseNameToIdMapRef = std::reference_wrapper<const LowercaseNameToIdMap>;
229
// Field ID -> sequence of size_t positions. Per the member comment below this
// is the (nested) position path used to reach the field.
230 using IdToPositionPathMap = std::unordered_map<int32_t, std::vector<size_t>>;
231 using IdToPositionPathMapRef = std::reference_wrapper<const IdToPositionPathMap>;
232
// Accessors. The map accessors return a reference_wrapper to a const map
// rather than a copy; NOTE(review): presumably the referenced map lives in the
// matching Lazy<> member and stays valid for the lifetime of this cache --
// confirm. GetHighestFieldId returns the int32_t by value.
233 Result<IdToFieldMapRef> GetIdToFieldMap() const;
234 Result<NameIdMapRef> GetNameIdMap() const;
235 Result<LowercaseNameToIdMapRef> GetLowercaseNameToIdMap() const;
236 Result<IdToPositionPathMapRef> GetIdToPositionPathMap() const;
237 Result<int32_t> GetHighestFieldId() const;
238
239 private:
// Builders for each cached structure. They are static and take the Schema
// explicitly, which lets them be used as the template arguments of the
// Lazy<> members below.
240 static Result<IdToFieldMap> InitIdToFieldMap(const Schema* schema);
241 static Result<NameIdMap> InitNameIdMap(const Schema* schema);
242 static Result<LowercaseNameToIdMap> InitLowerCaseNameToIdMap(const Schema* schema);
243 static Result<IdToPositionPathMap> InitIdToPositionPath(const Schema* schema);
244 static Result<int32_t> InitHighestFieldId(const Schema* schema);
245
// The schema this cache was constructed with (raw pointer, set once in the
// constructor).
246 const Schema* schema_;
247 // Mapping from field id to field.
248 Lazy<InitIdToFieldMap> id_to_field_;
249 // Mapping from field name to field id.
250 Lazy<InitNameIdMap> name_id_map_;
251 // Mapping from lowercased field name to field id.
252 Lazy<InitLowerCaseNameToIdMap> lowercase_name_to_id_;
253 // Mapping from field id to (nested) position path to access the field.
254 Lazy<InitIdToPositionPath> id_to_position_path_;
255 // Highest field ID in the schema.
256 Lazy<InitHighestFieldId> highest_field_id_;
257};
258
259} // namespace iceberg
Definition schema.h:202
A schema for a Table.
Definition schema.h:49
A data type representing a struct with nested fields.
Definition type.h:108
Definition schema.h:210
std::unordered_map< int32_t, std::string > id_to_name
Mapping from field ID to canonical name.
Definition schema.h:222
std::unordered_map< std::string, int32_t, StringHash, std::equal_to<> > name_to_id
Mapping from canonical field name to ID.
Definition schema.h:216