Java Code Examples: parquet.schema.MessageType
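MessageType represents the root of a Parquet file schema. The examples below, collected from open-source projects, show how a MessageType is built, parsed, projected, and inspected. As a quick orientation, here is a minimal, hypothetical sketch (assuming the pre-Apache parquet.schema package named in the title; class and field names are made up) that defines the same two-column schema both by parsing its textual form and by calling the constructor directly:

import parquet.schema.MessageType;
import parquet.schema.MessageTypeParser;
import parquet.schema.PrimitiveType;
import parquet.schema.PrimitiveType.PrimitiveTypeName;
import parquet.schema.Type.Repetition;

public class MessageTypeIntro {
    public static void main(String[] args) {
        // Parse the textual representation (the same format MessageType.toString() produces).
        MessageType parsed = MessageTypeParser.parseMessageType(
                "message example { required int64 id; optional binary name; }");

        // Build the equivalent schema programmatically.
        MessageType built = new MessageType("example",
                new PrimitiveType(Repetition.REQUIRED, PrimitiveTypeName.INT64, "id"),
                new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, "name"));

        System.out.println(parsed);
        System.out.println(built.containsField("id")); // true
    }
}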
Example 1
private ExaParquetWriterImpl(final MessageType schema,
                             final int numColumns,
                             final Configuration conf,
                             final Path path,
                             final String compressionType,
                             final ExaIterator exa,
                             final int firstColumnIndex,
                             final List<Integer> dynamicPartitionExaColNums) throws Exception {
    System.out.println("Path: " + path.toString());
    System.out.println("Parquet schema:\n" + schema);
    TupleWriteSupport.setSchema(schema, conf);
    this.writer = new ParquetWriter<>(path,
            new TupleWriteSupport(),
            CompressionCodecName.fromConf(compressionType),
            ParquetWriter.DEFAULT_BLOCK_SIZE,
            ParquetWriter.DEFAULT_PAGE_SIZE,
            ParquetWriter.DEFAULT_PAGE_SIZE,
            ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED,
            ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED,
            conf);
    // Create Tuple object with ExaIterator reference.
    this.row = new Tuple(exa, numColumns, firstColumnIndex, dynamicPartitionExaColNums);
}
Example 2
public ExaParquetWriterImpl(final List<ExaParquetTypeInfo> schemaTypes,
                            final Configuration conf,
                            final Path path,
                            final String compressionType,
                            final ExaIterator exa,
                            final int firstColumnIndex,
                            final List<Integer> dynamicPartitionExaColNums) throws Exception {
    // Use the schemaTypes provided since HCat table metadata isn't available.
    // This should normally only be used for testing.
    this(new MessageType("hive_schema", ExaParquetWriterImpl.typeInfoToParquetTypes(schemaTypes)),
            schemaTypes.size(),
            conf,
            path,
            compressionType,
            exa,
            firstColumnIndex,
            dynamicPartitionExaColNums);
}
Example 3
private Type buildSchema() {
    JsonArray inputSchema = this.jsonSchema.getDataTypeValues();
    List<Type> parquetTypes = new ArrayList<>();
    for (JsonElement element : inputSchema) {
        JsonObject map = (JsonObject) element;
        JsonSchema elementSchema = new JsonSchema(map);
        String columnName = elementSchema.getColumnName();
        JsonElementConverter converter = JsonElementConversionFactory.getConverter(elementSchema, false);
        Type schemaType = converter.schema();
        this.converters.put(columnName, converter);
        parquetTypes.add(schemaType);
    }
    String docName = this.jsonSchema.getColumnName();
    switch (recordType) {
        case ROOT:
            return new MessageType(docName, parquetTypes);
        case CHILD:
            return new GroupType(optionalOrRequired(this.jsonSchema), docName, parquetTypes);
        default:
            throw new RuntimeException("Unsupported Record type");
    }
}
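Example 3 leans on the fact that MessageType is just the root-level GroupType of a schema, so child records are modelled as nested GroupTypes. A small hedged sketch of that nesting (class and field names are illustrative, using the same parquet.schema API):

import parquet.schema.GroupType;
import parquet.schema.MessageType;
import parquet.schema.PrimitiveType;
import parquet.schema.PrimitiveType.PrimitiveTypeName;
import parquet.schema.Type.Repetition;

public class NestedSchemaSketch {
    public static void main(String[] args) {
        // A child record becomes a group nested under the root message.
        GroupType address = new GroupType(Repetition.OPTIONAL, "address",
                new PrimitiveType(Repetition.REQUIRED, PrimitiveTypeName.BINARY, "street"),
                new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, "city"));

        MessageType root = new MessageType("person",
                new PrimitiveType(Repetition.REQUIRED, PrimitiveTypeName.INT64, "id"),
                address);

        System.out.println(root); // prints the nested schema in Parquet's textual form
    }
}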
Example 4
@Override
public List<IParquetInputField> readSchema( String file ) throws Exception {
  return inClassloader( () -> {
    Configuration conf = job.getConfiguration();
    S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( file, conf );
    Path filePath = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( file ) );
    FileSystem fs = FileSystem.get( filePath.toUri(), conf );
    FileStatus fileStatus = fs.getFileStatus( filePath );
    List<Footer> footers = ParquetFileReader.readFooters( conf, fileStatus, true );
    if ( footers.isEmpty() ) {
      return new ArrayList<>();
    } else {
      ParquetMetadata meta = footers.get( 0 ).getParquetMetadata();
      MessageType schema = meta.getFileMetaData().getSchema();
      return ParquetConverter.buildInputFields( schema );
    }
  } );
}
Example 5
@Override
public ReadContext init( InitContext context ) {
  String schemaStr = context.getConfiguration().get( ParquetConverter.PARQUET_SCHEMA_CONF_KEY );
  if ( schemaStr == null ) {
    throw new RuntimeException( "Schema not defined in the PentahoParquetSchema key" );
  }
  ParquetInputFieldList schema = ParquetInputFieldList.unmarshall( schemaStr );
  converter = new ParquetConverter( schema.getFields() );
  // get all fields from file's schema
  MessageType fileSchema = context.getFileSchema();
  List<Type> newFields = new ArrayList<>();
  // use only required fields
  for ( IParquetInputField f : schema ) {
    Type origField = fileSchema.getFields().get( fileSchema.getFieldIndex( f.getFormatFieldName() ) );
    newFields.add( origField );
  }
  if ( newFields.isEmpty() ) {
    throw new RuntimeException( "Fields should be declared" );
  }
  MessageType newSchema = new MessageType( fileSchema.getName(), newFields );
  return new ReadContext( newSchema, new HashMap<>() );
}
Example 6
private static void showDetails(PrettyPrintWriter out, PrimitiveType type, int depth, MessageType container, List<String> cpath) {
    String name = Strings.repeat(".", depth) + type.getName();
    OriginalType otype = type.getOriginalType();
    Repetition rep = type.getRepetition();
    PrimitiveTypeName ptype = type.getPrimitiveTypeName();
    out.format("%s: %s %s", name, rep, ptype);
    if (otype != null) out.format(" O:%s", otype);
    if (container != null) {
        cpath.add(type.getName());
        String[] paths = cpath.toArray(new String[cpath.size()]);
        cpath.remove(cpath.size() - 1);
        ColumnDescriptor desc = container.getColumnDescription(paths);
        int defl = desc.getMaxDefinitionLevel();
        int repl = desc.getMaxRepetitionLevel();
        out.format(" R:%d D:%d", repl, defl);
    }
    out.println();
}
Example 7
public ParquetReader(MessageType fileSchema,
        MessageType requestedSchema,
        List<BlockMetaData> blocks,
        ParquetDataSource dataSource,
        TypeManager typeManager)
{
    this.fileSchema = fileSchema;
    this.requestedSchema = requestedSchema;
    this.blocks = blocks;
    this.dataSource = dataSource;
    this.typeManager = typeManager;
    initializeColumnReaders();
}
Example 8
private static MessageType readParquetSchema(List<SchemaElement> schema)
{
    Iterator<SchemaElement> schemaIterator = schema.iterator();
    SchemaElement rootSchema = schemaIterator.next();
    Types.MessageTypeBuilder builder = Types.buildMessage();
    readTypeSchema(builder, schemaIterator, rootSchema.getNum_children());
    return builder.named(rootSchema.name);
}
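Example 8 assembles the MessageType with the fluent Types.buildMessage() builder instead of the constructor. A stand-alone sketch of that builder API (the schema contents are illustrative):

import parquet.schema.MessageType;
import parquet.schema.OriginalType;
import parquet.schema.PrimitiveType.PrimitiveTypeName;
import parquet.schema.Types;

public class BuilderSketch {
    public static void main(String[] args) {
        // Each field's named(...) call returns to the parent builder;
        // the final named(...) produces the MessageType itself.
        MessageType schema = Types.buildMessage()
                .required(PrimitiveTypeName.INT64).named("id")
                .optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("name")
                .named("hive_schema");

        System.out.println(schema);
    }
}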
Example 9
private Type getParquetType(ParaflowColumnHandle column, MessageType messageType)
{
    if (messageType.containsField(column.getName())) {
        return messageType.getType(column.getName());
    }
    // parquet is case-insensitive, all hdfs-columns get converted to lowercase
    for (Type type : messageType.getFields()) {
        if (type.getName().equalsIgnoreCase(column.getName())) {
            return type;
        }
    }
    return null;
}
Example 10
public ParaflowPageSource(
        ParquetReader parquetReader,
        ParquetDataSource dataSource,
        MessageType fileSchema,
        MessageType requestedSchema,
        long totalBytes,
        List<ParaflowColumnHandle> columns,
        TypeManager typeManager)
{
    checkArgument(totalBytes >= 0, "totalBytes is negative");
    this.parquetReader = requireNonNull(parquetReader, "parquetReader is null");
    this.dataSource = requireNonNull(dataSource, "dataSource is null");
    this.fileSchema = requireNonNull(fileSchema, "fileSchema is null");
    this.requestedSchema = requireNonNull(requestedSchema, "requestedSchema is null");
    this.totalBytes = totalBytes;
    this.columnSize = columns.size();
    this.constantBlocks = new Block[columnSize];
    ImmutableList.Builder<String> namesBuilder = ImmutableList.builder();
    ImmutableList.Builder<Type> typesBuilder = ImmutableList.builder();
    for (int columnIndex = 0; columnIndex < columnSize; columnIndex++) {
        ParaflowColumnHandle column = columns.get(columnIndex);
        String name = column.getName();
        Type type = typeManager.getType(column.getType().getTypeSignature());
        namesBuilder.add(name);
        typesBuilder.add(type);
        if (getParquetType(column, fileSchema) == null) {
            constantBlocks[columnIndex] = RunLengthEncodedBlock.create(type, null, MAX_VECTOR_LENGTH);
        }
    }
    columnNames = namesBuilder.build();
    types = typesBuilder.build();
}
Example 11
private parquet.schema.Type getParquetType(ParaflowColumnHandle column, MessageType messageType)
{
    if (messageType.containsField(column.getName())) {
        return messageType.getType(column.getName());
    }
    // parquet is case-insensitive, all hdfs-columns get converted to lowercase
    for (parquet.schema.Type type : messageType.getFields()) {
        if (type.getName().equalsIgnoreCase(column.getName())) {
            return type;
        }
    }
    return null;
}
Example 12
public int getFieldIndex(MessageType fileSchema, String name)
{
    try {
        return fileSchema.getFieldIndex(name);
    }
    catch (InvalidRecordException e) {
        for (parquet.schema.Type type : fileSchema.getFields()) {
            if (type.getName().equalsIgnoreCase(name)) {
                return fileSchema.getFieldIndex(type.getName());
            }
        }
        return -1;
    }
}
Example 13
private DatasetDescriptor getDatasetDescriptorFromParquetFile(Job job, FileSystem fs, String uri)
        throws IOException {
    ArrayList<FileStatus> files = new ArrayList<FileStatus>();
    FileStatus[] dirs;
    dirs = fs.globStatus(fs.makeQualified(getInputPath()));
    for (int i = 0; (dirs != null && i < dirs.length); i++) {
        files.addAll(Arrays.asList(fs.listStatus(dirs[i].getPath(), HIDDEN_FILES_PATH_FILTER)));
        // We only check one file, so exit the loop when we have at least
        // one.
        if (files.size() > 0) {
            break;
        }
    }
    ParquetMetadata parquetMetadata;
    try {
        parquetMetadata =
                ParquetFileReader.readFooter(job.getConfiguration(),
                        fs.makeQualified(files.get(0).getPath()));
    } catch (IOException e) {
        LOG.error("Wrong file format. Please check the export file's format.", e);
        throw e;
    }
    MessageType schema = parquetMetadata.getFileMetaData().getSchema();
    Schema avroSchema = new AvroSchemaConverter().convert(schema);
    DatasetDescriptor descriptor =
            new DatasetDescriptor.Builder().schema(avroSchema).format(Formats.PARQUET)
                    .compressionType(ParquetJob.getCompressionType(job.getConfiguration())).build();
    return descriptor;
}
Example 14
@Override
public MessageType convertSchema(JsonArray inputSchema, WorkUnitState workUnit)
        throws SchemaConversionException {
    String fieldName = workUnit.getExtract().getTable();
    JsonSchema jsonSchema = new JsonSchema(inputSchema);
    jsonSchema.setColumnName(fieldName);
    recordConverter = new RecordConverter(jsonSchema, ROOT);
    return (MessageType) recordConverter.schema();
}
Example 15
private void testCase(String testCaseName)
        throws SchemaConversionException, DataConversionException {
    JsonObject test = testCases.get(testCaseName).getAsJsonObject();
    parquetConverter = new JsonIntermediateToParquetGroupConverter();
    MessageType schema = parquetConverter.convertSchema(test.get("schema").getAsJsonArray(), workUnit);
    Group record =
            parquetConverter.convertRecord(schema, test.get("record").getAsJsonObject(), workUnit).iterator().next();
    assertEqualsIgnoreSpaces(schema.toString(), test.get("expectedSchema").getAsString());
    assertEqualsIgnoreSpaces(record.toString(), test.get("expectedRecord").getAsString());
}
Example 16
@Test(expectedExceptions = RuntimeException.class, expectedExceptionsMessageRegExp = "Symbol .* does not belong to set \\[.*?\\]")
public void testEnumTypeBelongsToEnumSet()
        throws Exception {
    JsonObject test = deepCopy(testCases.get("enum").getAsJsonObject(), JsonObject.class);
    parquetConverter = new JsonIntermediateToParquetGroupConverter();
    MessageType schema = parquetConverter.convertSchema(test.get("schema").getAsJsonArray(), workUnit);
    JsonObject jsonRecord = test.get("record").getAsJsonObject();
    jsonRecord.addProperty("some_enum", "HELL");
    parquetConverter.convertRecord(schema, jsonRecord, workUnit).iterator().next();
}
Example 17
private MessageType createParquetSchema() {
  List<Type> types = new ArrayList<>();
  for ( IParquetOutputField outputField : outputFields ) {
    types.add( convertToPrimitiveType( outputField ) );
  }
  if ( types.isEmpty() ) {
    throw new IllegalArgumentException( "Schema should contain at least one field" );
  }
  return new MessageType( "parquet-schema", types );
}
Example 18
public static List<IParquetInputField> buildInputFields( MessageType schema ) {
  List<IParquetInputField> inputFields = new ArrayList<>();
  for ( Type type : schema.getFields() ) {
    if ( type.isPrimitive() ) {
      inputFields.add( convertField( type ) );
    }
  }
  return inputFields;
}
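Example 18 walks only the top-level fields via getFields() and keeps the primitive ones. When nested groups also matter, MessageType.getColumns() flattens the whole schema into one ColumnDescriptor per leaf, which is the same lookup Examples 6 and 24 rely on. A brief sketch (the schema string is illustrative):

import parquet.column.ColumnDescriptor;
import parquet.schema.MessageType;
import parquet.schema.MessageTypeParser;

public class ColumnWalkSketch {
    public static void main(String[] args) {
        MessageType schema = MessageTypeParser.parseMessageType(
                "message m { required int64 id; optional group address { optional binary city; } }");

        // One ColumnDescriptor per leaf field, including leaves inside nested groups.
        for (ColumnDescriptor column : schema.getColumns()) {
            System.out.println(String.join(".", column.getPath())
                    + " maxDef=" + column.getMaxDefinitionLevel()
                    + " maxRep=" + column.getMaxRepetitionLevel());
        }
    }
}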
Example 19
private static void showDetails(PrettyPrintWriter out, GroupType type, int depth, MessageType container, List<String> cpath) {
    String name = Strings.repeat(".", depth) + type.getName();
    Repetition rep = type.getRepetition();
    int fcount = type.getFieldCount();
    out.format("%s: %s F:%d%n", name, rep, fcount);
    cpath.add(type.getName());
    for (Type ftype : type.getFields()) {
        showDetails(out, ftype, depth + 1, container, cpath);
    }
    cpath.remove(cpath.size() - 1);
}
Example 20
private static void showDetails(PrettyPrintWriter out, Type type, int depth, MessageType container, List<String> cpath) {
    if (type instanceof GroupType) {
        showDetails(out, type.asGroupType(), depth, container, cpath);
        return;
    } else if (type instanceof PrimitiveType) {
        showDetails(out, type.asPrimitiveType(), depth, container, cpath);
        return;
    }
}
Example 21
@Override
public void execute(CommandLine options) throws Exception {
    super.execute(options);
    String[] args = options.getArgs();
    String input = args[0];
    Configuration conf = new Configuration();
    Path inpath = new Path(input);
    ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inpath);
    MessageType schema = metaData.getFileMetaData().getSchema();
    PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
            .withAutoColumn()
            .withAutoCrop()
            .withWhitespaceHandler(WhiteSpaceHandler.ELIMINATE_NEWLINES)
            .withColumnPadding(1)
            .withMaxBufferedLines(1000000)
            .withFlushOnTab()
            .build();
    boolean showmd = !options.hasOption('m');
    boolean showdt = !options.hasOption('d');
    Set<String> showColumns = null;
    if (options.hasOption('c')) {
        String[] cols = options.getOptionValues('c');
        showColumns = new HashSet<String>(Arrays.asList(cols));
    }
    dump(out, metaData, schema, inpath, showmd, showdt, showColumns);
}
Example 22
public static ParquetMetadata readFooter(FileSystem fileSystem, Path file)
        throws IOException
{
    FileStatus fileStatus = fileSystem.getFileStatus(file);
    try (FSDataInputStream inputStream = fileSystem.open(file)) {
        // Parquet File Layout:
        //
        // MAGIC
        // variable: Data
        // variable: Metadata
        // 4 bytes: MetadataLength
        // MAGIC
        long length = fileStatus.getLen();
        validateParquet(length >= MAGIC.length + PARQUET_METADATA_LENGTH + MAGIC.length, "%s is not a valid Parquet File", file);
        long metadataLengthIndex = length - PARQUET_METADATA_LENGTH - MAGIC.length;
        inputStream.seek(metadataLengthIndex);
        int metadataLength = readIntLittleEndian(inputStream);
        byte[] magic = new byte[MAGIC.length];
        inputStream.readFully(magic);
        validateParquet(Arrays.equals(MAGIC, magic), "Not valid Parquet file: %s expected magic number: %s got: %s", file, Arrays.toString(MAGIC), Arrays.toString(magic));
        long metadataIndex = metadataLengthIndex - metadataLength;
        validateParquet(
                metadataIndex >= MAGIC.length && metadataIndex < metadataLengthIndex,
                "Corrupted Parquet file: %s metadata index: %s out of range",
                file,
                metadataIndex);
        inputStream.seek(metadataIndex);
        FileMetaData fileMetaData = readFileMetaData(inputStream);
        List<SchemaElement> schema = fileMetaData.getSchema();
        validateParquet(!schema.isEmpty(), "Empty Parquet schema in file: %s", file);
        MessageType messageType = readParquetSchema(schema);
        List<BlockMetaData> blocks = new ArrayList<>();
        List<RowGroup> rowGroups = fileMetaData.getRow_groups();
        if (rowGroups != null) {
            for (RowGroup rowGroup : rowGroups) {
                BlockMetaData blockMetaData = new BlockMetaData();
                blockMetaData.setRowCount(rowGroup.getNum_rows());
                blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
                List<ColumnChunk> columns = rowGroup.getColumns();
                validateParquet(!columns.isEmpty(), "No columns in row group: %s", rowGroup);
                String filePath = columns.get(0).getFile_path();
                for (ColumnChunk columnChunk : columns) {
                    validateParquet(
                            (filePath == null && columnChunk.getFile_path() == null)
                                    || (filePath != null && filePath.equals(columnChunk.getFile_path())),
                            "all column chunks of the same row group must be in the same file");
                    ColumnMetaData metaData = columnChunk.meta_data;
                    String[] path = metaData.path_in_schema.toArray(new String[metaData.path_in_schema.size()]);
                    ColumnPath columnPath = ColumnPath.get(path);
                    ColumnChunkMetaData column = ColumnChunkMetaData.get(
                            columnPath,
                            messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName(),
                            CompressionCodecName.fromParquet(metaData.codec),
                            readEncodings(metaData.encodings),
                            readStats(metaData.statistics, messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName()),
                            metaData.data_page_offset,
                            metaData.dictionary_page_offset,
                            metaData.num_values,
                            metaData.total_compressed_size,
                            metaData.total_uncompressed_size);
                    blockMetaData.addColumn(column);
                }
                blockMetaData.setPath(filePath);
                blocks.add(blockMetaData);
            }
        }
        Map<String, String> keyValueMetaData = new HashMap<>();
        List<KeyValue> keyValueList = fileMetaData.getKey_value_metadata();
        if (keyValueList != null) {
            for (KeyValue keyValue : keyValueList) {
                keyValueMetaData.put(keyValue.key, keyValue.value);
            }
        }
        return new ParquetMetadata(new parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, fileMetaData.getCreated_by()), blocks);
    }
}
Example 23
private Optional<ConnectorPageSource> createParaflowPageSource(
        Path path,
        long start,
        long length,
        List<ParaflowColumnHandle> columns)
{
    Optional<FileSystem> fileSystemOptional = fsFactory.getFileSystem();
    FileSystem fileSystem;
    ParquetDataSource dataSource;
    if (fileSystemOptional.isPresent()) {
        fileSystem = fileSystemOptional.get();
    }
    else {
        throw new RuntimeException("Could not find filesystem for path " + path);
    }
    try {
        dataSource = buildHdfsParquetDataSource(fileSystem, path, start, length);
        // default length is file size, which means whole file is a split
        length = dataSource.getSize();
        ParquetMetadata parquetMetadata = ParquetMetadataReader.readFooter(fileSystem, path);
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        MessageType fileSchema = fileMetaData.getSchema();
        List<Type> fields = columns.stream()
                .filter(column -> column.getColType() != ParaflowColumnHandle.ColumnType.NOTVALID)
                .map(column -> getParquetType(column, fileSchema))
                .filter(Objects::nonNull)
                .collect(Collectors.toList());
        MessageType requestedSchema = new MessageType(fileSchema.getName(), fields);
        List<BlockMetaData> blocks = new ArrayList<>();
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            if (firstDataPage >= start && firstDataPage < start + length) {
                blocks.add(block);
            }
        }
        ParquetReader parquetReader = new ParquetReader(
                fileSchema,
                requestedSchema,
                blocks,
                dataSource,
                typeManager);
        return Optional.of(new ParaflowPageSource(
                parquetReader,
                dataSource,
                fileSchema,
                requestedSchema,
                length,
                columns,
                typeManager));
    }
    catch (IOException e) {
        log.error(e);
        return Optional.empty();
    }
}
Example 24
public static LocalMetrics execute(FileStatus[] fileStatuses, ParquetMetadata[] metadatas, String[] columnNames, Configuration conf) throws IOException
{
    boolean printColumns = true;
    List<ParquetFileReader> readers = new ArrayList<ParquetFileReader>();
    List<Column> columns = new ArrayList<Column>();
    for (int i = 0; i < fileStatuses.length; ++i)
    {
        FileStatus status = fileStatuses[i];
        ParquetMetadata metadata = metadatas[i];
        MessageType schema = metadata.getFileMetaData().getSchema();
        List<ColumnDescriptor> columnDescriptors = new ArrayList<ColumnDescriptor>();
        for (String columnName : columnNames)
        {
            int fieldIndex = schema.getFieldIndex(columnName.toLowerCase());
            ColumnDescriptor descriptor = schema.getColumns().get(fieldIndex);
            columnDescriptors.add(descriptor);
            if (printColumns)
            {
                Column column = new Column();
                column.setIndex(fieldIndex);
                column.setName(schema.getFieldName(column.getIndex()));
                column.setDescriptor(descriptor);
                columns.add(column);
            }
        }
        printColumns = false;
        readers.add(new ParquetFileReader(conf, status.getPath(), metadata.getBlocks(), columnDescriptors));
    }
    long time = System.currentTimeMillis();
    long rowCount = 0;
    long rowGroupCount = 0;
    long readerCount = readers.size();
    for (ParquetFileReader reader : readers)
    {
        PageReadStore pageReadStore;
        while ((pageReadStore = reader.readNextRowGroup()) != null)
        {
            rowGroupCount++;
            rowCount += pageReadStore.getRowCount();
        }
        reader.close();
    }
    LocalMetrics metrics = new LocalMetrics(columns, readerCount, rowGroupCount, rowCount, System.currentTimeMillis() - time);
    return metrics;
}
Example 25
public static void setSchema(MessageType schema, Configuration configuration) {
    configuration.set(PARQUET_SCHEMA_PROPERTY_NAME, schema.toString());
}
Example 26
public static MessageType getSchema(Configuration configuration) {
    return MessageTypeParser.parseMessageType(configuration.get(PARQUET_SCHEMA_PROPERTY_NAME));
}
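Examples 25 and 26 hand the schema between writer components by round-tripping it through its textual form: toString() emits the message definition and MessageTypeParser.parseMessageType() rebuilds it. A minimal sketch of that round trip (the configuration key is hypothetical, not the one these projects use):

import org.apache.hadoop.conf.Configuration;
import parquet.schema.MessageType;
import parquet.schema.MessageTypeParser;

public class SchemaRoundTripSketch {
    private static final String KEY = "example.parquet.schema"; // illustrative property name

    public static void main(String[] args) {
        MessageType schema = MessageTypeParser.parseMessageType(
                "message m { required int64 id; optional binary name; }");

        // Store the schema as a string, then parse it back, as Examples 25 and 26 do.
        Configuration conf = new Configuration();
        conf.set(KEY, schema.toString());

        MessageType restored = MessageTypeParser.parseMessageType(conf.get(KEY));
        System.out.println(schema.equals(restored)); // true for this simple schema
    }
}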
Example 27
@Override
public Iterable<Group> convertRecord(MessageType outputSchema, JsonObject inputRecord, WorkUnitState workUnit)
        throws DataConversionException {
    return new SingleRecordIterable<>((Group) recordConverter.convert(inputRecord));
}
Example 28
/**
 * Build a version-specific {@link ParquetWriter} for the given {@link ParquetWriterConfiguration}
 * @param writerConfiguration
 * @return
 * @throws IOException
 */
@Override
public ParquetWriterShim getVersionSpecificWriter(ParquetWriterConfiguration writerConfiguration)
        throws IOException {
    CompressionCodecName codecName = CompressionCodecName.fromConf(writerConfiguration.getCodecName());
    ParquetProperties.WriterVersion writerVersion = ParquetProperties.WriterVersion
            .fromString(writerConfiguration.getWriterVersion());
    Configuration conf = new Configuration();
    ParquetWriter versionSpecificWriter = null;
    switch (writerConfiguration.getRecordFormat()) {
        case GROUP: {
            GroupWriteSupport.setSchema((MessageType) this.schema, conf);
            WriteSupport support = new GroupWriteSupport();
            versionSpecificWriter = new ParquetWriter<Group>(
                    writerConfiguration.getAbsoluteStagingFile(),
                    support,
                    codecName,
                    writerConfiguration.getBlockSize(),
                    writerConfiguration.getPageSize(),
                    writerConfiguration.getDictPageSize(),
                    writerConfiguration.isDictionaryEnabled(),
                    writerConfiguration.isValidate(),
                    writerVersion,
                    conf);
            break;
        }
        case AVRO: {
            versionSpecificWriter = new AvroParquetWriter(
                    writerConfiguration.getAbsoluteStagingFile(),
                    (Schema) this.schema,
                    codecName,
                    writerConfiguration.getBlockSize(),
                    writerConfiguration.getPageSize(),
                    writerConfiguration.isDictionaryEnabled(),
                    conf);
            break;
        }
        case PROTOBUF: {
            versionSpecificWriter = new ProtoParquetWriter(
                    writerConfiguration.getAbsoluteStagingFile(),
                    (Class<? extends Message>) this.schema,
                    codecName,
                    writerConfiguration.getBlockSize(),
                    writerConfiguration.getPageSize(),
                    writerConfiguration.isDictionaryEnabled(),
                    writerConfiguration.isValidate());
            break;
        }
        default: throw new RuntimeException("Record format not supported");
    }
    ParquetWriter finalVersionSpecificWriter = versionSpecificWriter;
    return new ParquetWriterShim() {
        @Override
        public void write(Object record)
                throws IOException {
            finalVersionSpecificWriter.write(record);
        }
        @Override
        public void close()
                throws IOException {
            finalVersionSpecificWriter.close();
        }
    };
}
Example 29
@Override
public RecordMaterializer<Group> prepareForRead(Configuration conf, Map<String, String> metaData,
        MessageType schema, ReadContext context) {
    return new GroupRecordConverter(schema);
}
Example 30
@Override
public RecordMaterializer<RowMetaAndData> prepareForRead( Configuration configuration,
                                                          Map<String, String> keyValueMetaData,
                                                          MessageType fileSchema, ReadContext readContext ) {
  return new ParquetConverter.MyRecordMaterializer( converter );
}