iceberg-cpp
Loading...
Searching...
No Matches
file_reader.h
Go to the documentation of this file.
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20#pragma once
21
24
25#include <functional>
26#include <memory>
27#include <optional>
28
30#include "iceberg/file_format.h"
31#include "iceberg/result.h"
32#include "iceberg/type_fwd.h"
33#include "iceberg/util/config.h"
34
35namespace iceberg {
36
38class ICEBERG_EXPORT Reader {
39 public:
40 virtual ~Reader() = default;
41 Reader() = default;
42 Reader(const Reader&) = delete;
43 Reader& operator=(const Reader&) = delete;
44
46 virtual Status Open(const ReaderOptions& options) = 0;
47
49 virtual Status Close() = 0;
50
54 virtual Result<std::optional<ArrowArray>> Next() = 0;
55
57 virtual Result<ArrowSchema> Schema() = 0;
58
60 virtual Result<std::unordered_map<std::string, std::string>> Metadata() = 0;
61};
62
64struct ICEBERG_EXPORT Split {
66 size_t offset;
68 size_t length;
69};
70
71class ReaderProperties : public ConfigBase<ReaderProperties> {
72 public:
73 template <typename T>
74 using Entry = const ConfigBase<ReaderProperties>::Entry<T>;
75
77 inline static Entry<int64_t> kBatchSize{"read.batch-size", 4096};
81 inline static Entry<bool> kAvroSkipDatum{"read.avro.skip-datum", true};
83 inline static Entry<int64_t> kAvroBufferSize{"read.avro.buffer-size", 1024 * 1024};
84
87 const std::unordered_map<std::string, std::string>& properties);
88};
89
91struct ICEBERG_EXPORT ReaderOptions {
93 std::string path;
95 std::optional<size_t> length;
97 std::optional<Split> split;
99 std::shared_ptr<class FileIO> io;
101 std::shared_ptr<class Schema> projection;
104 std::shared_ptr<class Expression> filter;
107 std::shared_ptr<class NameMapping> name_mapping;
110};
111
113using ReaderFactory = std::function<Result<std::unique_ptr<Reader>>()>;
114
116struct ICEBERG_EXPORT ReaderFactoryRegistry {
118 ReaderFactoryRegistry(FileFormatType format_type, ReaderFactory factory);
119
121 static ReaderFactory& GetFactory(FileFormatType format_type);
122
124 static Result<std::unique_ptr<Reader>> Open(FileFormatType format_type,
125 const ReaderOptions& options);
126};
127
128} // namespace iceberg
Definition config.h:70
Definition config.h:67
Definition file_reader.h:71
static ReaderProperties FromMap(const std::unordered_map< std::string, std::string > &properties)
Create a ReaderProperties instance from a map of key-value pairs.
Definition file_reader.cc:62
static Entry< int64_t > kAvroBufferSize
The buffer size used by Avro input stream.
Definition file_reader.h:83
static Entry< int64_t > kBatchSize
The batch size to read.
Definition file_reader.h:77
static Entry< bool > kAvroSkipDatum
Skip GenericDatum in Avro reader for better performance. When true, decode directly from Avro to Arrow...
Definition file_reader.h:81
Base reader class to read data from different file formats.
Definition file_reader.h:38
virtual Result< std::optional< ArrowArray > > Next()=0
Read next data from file.
virtual Result< ArrowSchema > Schema()=0
Get the schema of the data.
virtual Result< std::unordered_map< std::string, std::string > > Metadata()=0
Get the metadata of the file.
virtual Status Open(const ReaderOptions &options)=0
Open the reader.
virtual Status Close()=0
Close the reader.
std::function< Result< std::unique_ptr< Reader > >()> ReaderFactory
Factory function to create a reader of a specific file format.
Definition file_reader.h:113
Registry of reader factories for different file formats.
Definition file_reader.h:116
Options for creating a reader.
Definition file_reader.h:91
std::shared_ptr< class Schema > projection
The projection schema to read from the file. This field is required.
Definition file_reader.h:101
std::shared_ptr< class FileIO > io
FileIO instance to open the file.
Definition file_reader.h:99
std::optional< size_t > length
The total length of the file.
Definition file_reader.h:95
std::string path
The path to the file to read.
Definition file_reader.h:93
std::shared_ptr< class NameMapping > name_mapping
Name mapping for schema evolution compatibility. Used when reading files that may have different fiel...
Definition file_reader.h:107
ReaderProperties properties
Format-specific or implementation-specific properties.
Definition file_reader.h:109
std::optional< Split > split
The split to read.
Definition file_reader.h:97
std::shared_ptr< class Expression > filter
The filter to apply to the data. Reader implementations may ignore this if the file format does not support...
Definition file_reader.h:104
A split of the file to read.
Definition file_reader.h:64
size_t offset
The offset of the split.
Definition file_reader.h:66
size_t length
The length of the split.
Definition file_reader.h:68