comparing_parquet_files
155 lines
nic@xps-15:~/arrow$ parquet-tools inspect ../Downloads/papers.parquet
nic@xps-15:~/arrow$ parquet-tools inspect "/tmp/RtmpfoyxmB/file18fa6b312836/part-0.parquet"
############ file meta data ############
############ file meta data ############
created_by: parquet-go version 18.0.0-SNAPSHOT
created_by: parquet-cpp-arrow version 20.0.0-SNAPSHOT
num_columns: 13
num_columns: 13
num_rows: 64141
num_rows: 64141
num_row_groups: 1
num_row_groups: 1
format_version: 2.6
format_version: 2.6
serialized_size: 1819
serialized_size: 3124
############ Columns ############
############ Columns ############
paper_id
paper_id
softcite_id
softcite_id
title
title
published_year
published_year
published_date
published_date
publication_venue
publication_venue
publisher_name
publisher_name
doi
doi
pmcid
pmcid
pmid
pmid
genre
genre
license_type
license_type
has_mentions
has_mentions
############ Column(paper_id) ############
############ Column(paper_id) ############
name: paper_id
name: paper_id
path: paper_id
path: paper_id
max_definition_level: 0
max_definition_level: 1
max_repetition_level: 0
max_repetition_level: 0
physical_type: INT32
physical_type: INT32
logical_type: Int(bitWidth=32, isSigned=false)
logical_type: Int(bitWidth=32, isSigned=false)
converted_type (legacy): UINT_32
converted_type (legacy): UINT_32
compression: GZIP (space_saved: 13%)
compression: GZIP (space_saved: 22%)
############ Column(softcite_id) ############
############ Column(softcite_id) ############
name: softcite_id
name: softcite_id
path: softcite_id
path: softcite_id
max_definition_level: 0
max_definition_level: 1
max_repetition_level: 0
max_repetition_level: 0
physical_type: BYTE_ARRAY
physical_type: BYTE_ARRAY
logical_type: String
logical_type: String
converted_type (legacy): UTF8
converted_type (legacy): UTF8
compression: GZIP (space_saved: 50%)
compression: GZIP (space_saved: 47%)
############ Column(title) ############
############ Column(title) ############
name: title
name: title
path: title
path: title
max_definition_level: 1
max_definition_level: 1
max_repetition_level: 0
max_repetition_level: 0
physical_type: BYTE_ARRAY
physical_type: BYTE_ARRAY
logical_type: String
logical_type: String
converted_type (legacy): UTF8
converted_type (legacy): UTF8
compression: GZIP (space_saved: 56%)
compression: GZIP (space_saved: 55%)
############ Column(published_year) ############
############ Column(published_year) ############
name: published_year
name: published_year
path: published_year
path: published_year
max_definition_level: 1
max_definition_level: 1
max_repetition_level: 0
max_repetition_level: 0
physical_type: INT32
physical_type: INT32
logical_type: Int(bitWidth=16, isSigned=false)
logical_type: Int(bitWidth=16, isSigned=false)
converted_type (legacy): UINT_16
converted_type (legacy): UINT_16
compression: GZIP (space_saved: 18%)
compression: GZIP (space_saved: 18%)
############ Column(published_date) ############
############ Column(published_date) ############
name: published_date
name: published_date
path: published_date
path: published_date
max_definition_level: 1
max_definition_level: 1
max_repetition_level: 0
max_repetition_level: 0
physical_type: INT32
physical_type: INT32
logical_type: Date
logical_type: Date
converted_type (legacy): DATE
converted_type (legacy): DATE
compression: GZIP (space_saved: 10%)
compression: GZIP (space_saved: 10%)
############ Column(publication_venue) ############
############ Column(publication_venue) ############
name: publication_venue
name: publication_venue
path: publication_venue
path: publication_venue
max_definition_level: 0
max_definition_level: 1
max_repetition_level: 0
max_repetition_level: 0
physical_type: BYTE_ARRAY
physical_type: BYTE_ARRAY
logical_type: String
logical_type: String
converted_type (legacy): UTF8
converted_type (legacy): UTF8
compression: GZIP (space_saved: 59%)
compression: GZIP (space_saved: 59%)
############ Column(publisher_name) ############
############ Column(publisher_name) ############
name: publisher_name
name: publisher_name
path: publisher_name
path: publisher_name
max_definition_level: 1
max_definition_level: 1
max_repetition_level: 0
max_repetition_level: 0
physical_type: BYTE_ARRAY
physical_type: BYTE_ARRAY
logical_type: String
logical_type: String
converted_type (legacy): UTF8
converted_type (legacy): UTF8
compression: GZIP (space_saved: 49%)
compression: GZIP (space_saved: 48%)
############ Column(doi) ############
############ Column(doi) ############
name: doi
name: doi
path: doi
path: doi
max_definition_level: 0
max_definition_level: 1
max_repetition_level: 0
max_repetition_level: 0
physical_type: BYTE_ARRAY
physical_type: BYTE_ARRAY
logical_type: String
logical_type: String
converted_type (legacy): UTF8
converted_type (legacy): UTF8
compression: GZIP (space_saved: 61%)
compression: GZIP (space_saved: 59%)
############ Column(pmcid) ############
############ Column(pmcid) ############
name: pmcid
name: pmcid
path: pmcid
path: pmcid
max_definition_level: 1
max_definition_level: 1
max_repetition_level: 0
max_repetition_level: 0
physical_type: BYTE_ARRAY
physical_type: BYTE_ARRAY
logical_type: String
logical_type: String
converted_type (legacy): UTF8
converted_type (legacy): UTF8
compression: GZIP (space_saved: 63%)
compression: GZIP (space_saved: 63%)
############ Column(pmid) ############
############ Column(pmid) ############
name: pmid
name: pmid
path: pmid
path: pmid
max_definition_level: 1
max_definition_level: 1
max_repetition_level: 0
max_repetition_level: 0
physical_type: BYTE_ARRAY
physical_type: BYTE_ARRAY
logical_type: String
logical_type: String
converted_type (legacy): UTF8
converted_type (legacy): UTF8
compression: GZIP (space_saved: 57%)
compression: GZIP (space_saved: 56%)
############ Column(genre) ############
############ Column(genre) ############
name: genre
name: genre
path: genre
path: genre
max_definition_level: 1
max_definition_level: 1
max_repetition_level: 0
max_repetition_level: 0
physical_type: BYTE_ARRAY
physical_type: BYTE_ARRAY
logical_type: String
logical_type: String
converted_type (legacy): UTF8
converted_type (legacy): UTF8
compression: GZIP (space_saved: 60%)
compression: GZIP (space_saved: 56%)
############ Column(license_type) ############
############ Column(license_type) ############
name: license_type
name: license_type
path: license_type
path: license_type
max_definition_level: 1
max_definition_level: 1
max_repetition_level: 0
max_repetition_level: 0
physical_type: BYTE_ARRAY
physical_type: BYTE_ARRAY
logical_type: String
logical_type: String
converted_type (legacy): UTF8
converted_type (legacy): UTF8
compression: GZIP (space_saved: 49%)
compression: GZIP (space_saved: 45%)
############ Column(has_mentions) ############
############ Column(has_mentions) ############
name: has_mentions
name: has_mentions
path: has_mentions
path: has_mentions
max_definition_level: 0
max_definition_level: 1
max_repetition_level: 0
max_repetition_level: 0
physical_type: BOOLEAN
physical_type: BOOLEAN
logical_type: None
logical_type: None
converted_type (legacy): NONE
converted_type (legacy): NONE
compression: GZIP (space_saved: 99%)
compression: GZIP (space_saved: 99%)