Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.metastore.client.builder.DatabaseBuilder;
import org.apache.hadoop.hive.metastore.client.builder.TableBuilder;
import org.apache.hadoop.hive.metastore.conf.MetastoreConf;
import org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat;
import org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
Expand Down Expand Up @@ -60,6 +61,8 @@ public class TestStorageSchemaReader {
@Before public void setUp() throws Exception {
dbName = "sampleDb";
hiveConf = new HiveConf(this.getClass());
// Unset the deprecated HiveConf value so MetastoreConf uses its own default
hiveConf.unset(HiveConf.ConfVars.SERDES_USING_METASTORE_FOR_SCHEMA.varname);
new DatabaseBuilder().setName(dbName).create(new HiveMetaStoreClient(hiveConf), hiveConf);
avroTableParams.put("avro.schema.literal",
"{\"name\":\"nullable\", \"type\":\"record\", \"fields\":[{\"name\":\"id\", \"type\":\"int\"}, {\"name\":\"value\", \"type\":\"int\"}]}");
Expand Down Expand Up @@ -111,56 +114,56 @@ private void checkFields(List<FieldSchema> fieldSchemas, List<FieldSchema> field
}

@Test public void testAvroTableWithDefaultSSR() throws Exception {
hiveConf.set("metastore.storage.schema.reader.impl", "org.apache.hadoop.hive.metastore.DefaultStorageSchemaReader");
MetastoreConf.setVar(hiveConf, MetastoreConf.ConfVars.STORAGE_SCHEMA_READER_IMPL, "org.apache.hadoop.hive.metastore.DefaultStorageSchemaReader");
String tblName = "avroTable";
createTable(tblName, AvroSerDe.class.getName(), AvroContainerInputFormat.class.getName(),
Table tbl = createTable(tblName, AvroSerDe.class.getName(), AvroContainerInputFormat.class.getName(),
AvroContainerOutputFormat.class.getName(), avroTableParams, new HashMap<>());
assertThrows("Storage schema reading not supported", MetaException.class, () -> client.getSchema(dbName, tblName));
checkSchema(tblName, tbl);
}

@Test public void testAvroTableWithSerdeSSR() throws Exception {
hiveConf.set("metastore.storage.schema.reader.impl", "org.apache.hadoop.hive.metastore.SerDeStorageSchemaReader");
MetastoreConf.setVar(hiveConf, MetastoreConf.ConfVars.STORAGE_SCHEMA_READER_IMPL, "org.apache.hadoop.hive.metastore.SerDeStorageSchemaReader");
String tblName = "avroTable";
Table tbl = createTable(tblName, AvroSerDe.class.getName(), AvroContainerInputFormat.class.getName(),
AvroContainerOutputFormat.class.getName(), avroTableParams, new HashMap<>());
checkSchema(tblName, tbl);
}

@Test public void testHbaseTableWithDefaultSSR() throws Exception {
hiveConf.set("metastore.storage.schema.reader.impl", "org.apache.hadoop.hive.metastore.DefaultStorageSchemaReader");
String tblName = "jdbcTable";
MetastoreConf.setVar(hiveConf, MetastoreConf.ConfVars.STORAGE_SCHEMA_READER_IMPL, "org.apache.hadoop.hive.metastore.DefaultStorageSchemaReader");
String tblName = "hbaseTable";

createTable(tblName, HBaseSerDe.class.getName(), null, null, hbaseTableParams, hbaseSerdeParams);
assertThrows("Storage schema reading not supported", MetaException.class, () -> client.getSchema(dbName, tblName));
Table table = createTable(tblName, HBaseSerDe.class.getName(), null, null, hbaseTableParams, hbaseSerdeParams);
checkSchema(tblName, table);
}

@Test public void testHbaseTableWithSerdeSSR() throws Exception {
hiveConf.set("metastore.storage.schema.reader.impl", "org.apache.hadoop.hive.metastore.SerDeStorageSchemaReader");
String tblName = "jdbcTable";
MetastoreConf.setVar(hiveConf, MetastoreConf.ConfVars.STORAGE_SCHEMA_READER_IMPL, "org.apache.hadoop.hive.metastore.SerDeStorageSchemaReader");
String tblName = "hbaseTableSerde";

Table table =
createTable(tblName, "org.apache.hadoop.hive.hbase.HBaseSerDe", null, null, hbaseTableParams, hbaseSerdeParams);
checkSchema(tblName, table);
}

@Test public void testJdbcTableWithDefaultSSR() throws Exception {
hiveConf.set("metastore.storage.schema.reader.impl", "org.apache.hadoop.hive.metastore.DefaultStorageSchemaReader");
MetastoreConf.setVar(hiveConf, MetastoreConf.ConfVars.STORAGE_SCHEMA_READER_IMPL, "org.apache.hadoop.hive.metastore.DefaultStorageSchemaReader");
String tblName = "jdbcTable";

createTable(tblName, JdbcSerDe.class.getName(), null, null, jdbcTableParams, jdbcSerdeParams);
assertThrows("Storage schema reading not supported", MetaException.class, () -> client.getSchema(dbName, tblName));
}

@Test public void testJdbcTableWithSerdeSSR() throws Exception {
hiveConf.set("metastore.storage.schema.reader.impl", "org.apache.hadoop.hive.metastore.SerDeStorageSchemaReader");
MetastoreConf.setVar(hiveConf, MetastoreConf.ConfVars.STORAGE_SCHEMA_READER_IMPL, "org.apache.hadoop.hive.metastore.SerDeStorageSchemaReader");
String tblName = "jdbcTable";

Table table = createTable(tblName, JdbcSerDe.class.getName(), null, null, jdbcTableParams, jdbcSerdeParams);
checkSchema(tblName, table);
}

@Test public void testOrcTableWithDefaultSSR() throws Exception {
hiveConf.set("metastore.storage.schema.reader.impl", "org.apache.hadoop.hive.metastore.DefaultStorageSchemaReader");
MetastoreConf.setVar(hiveConf, MetastoreConf.ConfVars.STORAGE_SCHEMA_READER_IMPL, "org.apache.hadoop.hive.metastore.DefaultStorageSchemaReader");
String tblName = "orcTable2";
Table tbl =
createTable(tblName, OrcSerde.class.getName(), OrcInputFormat.class.getName(), OrcOutputFormat.class.getName(),
Expand All @@ -169,7 +172,7 @@ private void checkFields(List<FieldSchema> fieldSchemas, List<FieldSchema> field
}

@Test public void testOrcTableWithSerdeSSR() throws Exception {
hiveConf.set("metastore.storage.schema.reader.impl", "org.apache.hadoop.hive.metastore.SerDeStorageSchemaReader");
MetastoreConf.setVar(hiveConf, MetastoreConf.ConfVars.STORAGE_SCHEMA_READER_IMPL, "org.apache.hadoop.hive.metastore.SerDeStorageSchemaReader");
String tblName = "orcTable";
Table tbl =
createTable(tblName, OrcSerde.class.getName(), OrcInputFormat.class.getName(), OrcOutputFormat.class.getName(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1441,7 +1441,8 @@ public enum ConfVars {
"org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe," +
"org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe," +
"org.apache.hadoop.hive.serde2.OpenCSVSerde," +
"org.apache.iceberg.mr.hive.HiveIcebergSerDe",
"org.apache.iceberg.mr.hive.HiveIcebergSerDe," +
"org.apache.hadoop.hive.hbase.HBaseSerDe",
Comment thread
saihemanth-cloudera marked this conversation as resolved.
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some ideas:
This whitelist will keep expanding as we move on; how about:

  • For native tables, we just get the column from Metastore, regardless of what the serde is;
  • For non-native tables, we try to get the column from serde first, if it fails, then get the column from Metastore.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @dengzhhu653! To clarify, do you think it is acceptable to return the 'last known' schema as a fallback for non-native tables (like Avro) from HMS ? If we agree on this 'best-effort' approach implemented in this PR, we can handle the task of removing the whitelist logic in a follow-up PR.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I opt for the 'best-effort' approach, so the Metastore can get rid of the serde lib.
For those tables which determine the columns from serde, we can move StorageSchemaReader#readSchema to the client side

"SerDes retrieving schema from metastore. This is an internal parameter."),
SERDES_WITHOUT_FROM_DESERIALIZER("metastore.serdes.without.from.deserializer",
"hive.metastore.serdes.without.from.deserializer",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4271,14 +4271,25 @@ private List<FieldSchema> get_fields_with_environment_context_core(String db, St
} catch (NoSuchObjectException e) {
throw new UnknownTableException(e.getMessage());
}
if (null == tbl.getSd().getSerdeInfo().getSerializationLib() ||
String serdeLib = tbl.getSd().getSerdeInfo().getSerializationLib();
if (serdeLib == null ||
MetastoreConf.getStringCollection(conf,
ConfVars.SERDES_USING_METASTORE_FOR_SCHEMA).contains(
tbl.getSd().getSerdeInfo().getSerializationLib())) {
ConfVars.SERDES_USING_METASTORE_FOR_SCHEMA).contains(serdeLib)) {
ret = tbl.getSd().getCols();
} else {
StorageSchemaReader schemaReader = getStorageSchemaReader();
ret = schemaReader.readSchema(tbl, envContext, getConf());
try {
StorageSchemaReader schemaReader = getStorageSchemaReader();
ret = schemaReader.readSchema(tbl, envContext, getConf());
} catch (Exception e) {
Comment thread
rtrivedi12 marked this conversation as resolved.
Outdated
if ("org.apache.hadoop.hive.serde2.avro.AvroSerDe".equals(serdeLib)) {
LOG.warn("Unable to read schema from storage for AvroSerDe table '{}.{}' ({}). " +
"Returning metastore SD columns as fallback; schema may be stale ",
db, tableName, e.getMessage());
Comment thread
rtrivedi12 marked this conversation as resolved.
Outdated
ret = tbl.getSd().getCols();
} else {
throw new UnsupportedOperationException("Storage schema reading not supported");
Comment thread
rtrivedi12 marked this conversation as resolved.
Outdated
}
}
}
} catch (Exception e) {
ex = e;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3958,4 +3958,72 @@ public void testDropDataConnectorIfNotExistsTrue() throws Exception {
// No such data connector, ignore NoSuchObjectException
client.dropDataConnector("no_such_data_connector", true, false);
}

@Test
public void testGetFieldsForAvroSerDe() throws Exception {
String dbName = "test_avro_serde_db";
String avroTbl = "avro_tbl";

client.dropTable(dbName, avroTbl, true, true);
silentDropDatabase(dbName);

new DatabaseBuilder()
.setName(dbName)
.create(client, conf);

new TableBuilder()
.setDbName(dbName)
.setTableName(avroTbl)
.setSerdeLib("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
.setInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
.setOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
.addCol("foo", "int", "")
.addCol("bar", "string", "")
.addCol("baz", "bigint", "")
.create(client, conf);

List<FieldSchema> fields = client.getFields(dbName, avroTbl);
assertEquals("AvroSerDe table should return 3 fields from metastore", 3, fields.size());
assertEquals("foo", fields.get(0).getName());
assertEquals("bar", fields.get(1).getName());
assertEquals("baz", fields.get(2).getName());

List<FieldSchema> schema = client.getSchema(dbName, avroTbl);
assertEquals("AvroSerDe getSchema should return 3 columns", 3, schema.size());

client.dropTable(dbName, avroTbl, true, true);
client.dropDatabase(dbName);
}

@Test
public void testGetFieldsForHBaseSerDe() throws Exception {
String dbName = "test_hbase_serde_db";
String hbaseTbl = "hbase_tbl";

client.dropTable(dbName, hbaseTbl, true, true);
silentDropDatabase(dbName);

new DatabaseBuilder()
.setName(dbName)
.create(client, conf);

new TableBuilder()
.setDbName(dbName)
.setTableName(hbaseTbl)
.setSerdeLib("org.apache.hadoop.hive.hbase.HBaseSerDe")
.addCol("key", "string", "")
.addCol("value", "string", "")
.create(client, conf);

List<FieldSchema> fields = client.getFields(dbName, hbaseTbl);
assertEquals("HBaseSerDe table should return 2 fields from metastore", 2, fields.size());
assertEquals("key", fields.get(0).getName());
assertEquals("value", fields.get(1).getName());

List<FieldSchema> schema = client.getSchema(dbName, hbaseTbl);
assertEquals("HBaseSerDe getSchema should return 2 columns", 2, schema.size());

client.dropTable(dbName, hbaseTbl, true, true);
client.dropDatabase(dbName);
}
}
Loading