data-model-generator
Data model generator based on Scala case classes.
Table of contents
Goals
- Generate data model (e.g. DDL, avro schema, Elasticsearch mapping) based on Scala case classes
Getting started
Include dependency:
"com.github.piotr-kalanski" % "data-model-generator_2.11" % "0.7.7"
or
<dependency>
<groupId>com.github.piotr-kalanski</groupId>
<artifactId>data-model-generator_2.11</artifactId>
<version>0.7.7</version>
</dependency>
Dialects
H2 dialect
import com.datawizards.dmg.{DataModelGenerator, dialects}
case class Person(name: String, age: Int)
case class Book(title: String, year: Int, owner: Person, authors: Seq[Person])
object H2Example extends App {
println(DataModelGenerator.generate[Book](dialects.H2Dialect))
}
CREATE TABLE Book(
title VARCHAR,
year INT,
owner OTHER,
authors ARRAY
);
Hive dialect
import com.datawizards.dmg.{DataModelGenerator, dialects}
case class Person(name: String, age: Int)
case class Book(title: String, year: Int, owner: Person, authors: Seq[Person])
object HiveExample extends App {
println(DataModelGenerator.generate[Book](dialects.HiveDialect))
}
CREATE TABLE Book(
title STRING,
year INT,
owner STRUCT<name : STRING, age : INT>,
authors ARRAY<STRUCT<name : STRING, age : INT>>
);
Redshift dialect
import com.datawizards.dmg.{DataModelGenerator, dialects}
case class Person(name: String, age: Int)
case class Book(title: String, year: Int, owner: Person, authors: Seq[Person])
object RedshiftExample extends App {
println(DataModelGenerator.generate[Book](dialects.RedshiftDialect))
}
CREATE TABLE Book(
title VARCHAR,
year INTEGER,
owner VARCHAR,
authors VARCHAR
);
MySQL dialect
import com.datawizards.dmg.{DataModelGenerator, dialects}
case class Person(name: String, age: Int)
case class Book(title: String, year: Int, owner: Person, authors: Seq[Person])
object MySQLExample extends App {
println(DataModelGenerator.generate[Book](dialects.MySQLDialect))
}
CREATE TABLE Book(
title VARCHAR,
year INTEGER,
owner JSON,
authors JSON
);
Avro schema dialect
Avro schema
case class Person(name: String, age: Int)
case class Book(title: String, year: Int, owner: Person, authors: Seq[Person])
DataModelGenerator.generate[Book](dialects.AvroSchemaDialect)
{
"namespace": "com.datawizards.dmg.examples",
"type": "record",
"name": "Book",
"fields": [
{"name": "title", "type": "string"},
{"name": "year", "type": "int"},
{"name": "owner", "type": "record", "fields": [{"name": "name", "type": "string"}, {"name": "age", "type": "int"}]},
{"name": "authors", "type": "array", "items": {"type": "record", "fields": [{"name": "name", "type": "string"}, {"name": "age", "type": "int"}]}}
]
}
Avro schema for Avro Schema Registry
case class Person(name: String, age: Int, skills: Seq[String])
DataModelGenerator.generate[Person](dialects.AvroSchemaRegistryDialect)
{"schema":
"{
\"namespace\": \"com.datawizards.dmg.examples\",
\"type\": \"record\",
\"name\": \"Person\",
\"fields\": [
{\"name\": \"name\", \"type\": \"string\"},
{\"name\": \"age\", \"type\": \"int\"},
{\"name\": \"skills\", \"type\": \"array\", \"items\": \"string\"}
]
}"
}
Elasticsearch dialect
case class Person(name: String, age: Int)
case class Book(title: String, year: Int, owner: Person, authors: Seq[Person])
DataModelGenerator.generate[Book](dialects.ElasticsearchDialect)
{
"mappings" : {
"Book" : {
"properties" : {
"title" : {"type" : "string"},
"year" : {"type" : "integer"},
"owner" : {
"properties" : {
"name" : {"type" : "string"},
"age" : {"type" : "integer"}
}
},
"authors" : {
"properties" : {
"name" : {"type" : "string"},
"age" : {"type" : "integer"}
}
}
}
}
}
}
Java dialect
case class Person(name: String, age: Int)
DataModelGenerator.generate[Person](dialects.Java)
public class Person {
private String name;
private Integer age;
public Person() {}
public Person(String name, Integer age) {
this.name = name;
this.age = age;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public Integer getAge() {
return age;
}
public void setAge(Integer age) {
this.age = age;
}
}
Installers
Library enables installing generated data model at target data store e.g. registering generated avro schema at Avro Schema Registry, creating Elasticsearch index or creating Hive table.
Register Avro schema to Avro schema registry
import com.datawizards.dmg.service.AvroSchemaRegistryServiceImpl
case class Person(name: String, age: Int)
object RegisterAvroSchema extends App {
val service = new AvroSchemaRegistryServiceImpl("http://localhost:8081")
service.registerSchema[Person]("person")
println("Subjects:")
println(service.subjects())
println("Registered schema:")
println(service.fetchSchema("person"))
}
"Subjects:"
["person"]
"Registered schema:"
{"type":"record","name":"Person","namespace":"com.datawizards.dmg.examples","fields":[{"name":"name","type":"string"},{"name":"age","type":"int"}]}
Copy Avro schema to HDFS
import com.datawizards.dmg.service.AvroSchemaRegistryServiceImpl
case class Person(name: String, age: Int)
object CopyAvroSchemaToHDFS extends App {
val service = new AvroSchemaRegistryServiceImpl("http://localhost:8081")
service.copyAvroSchemaToHdfs[Person]("/metadata/schemas/person")
}
Create Elasticsearch index
import com.datawizards.dmg.service.ElasticsearchServiceImpl
case class Person(name: String, age: Int)
object CreateElasticsearchIndex extends App {
val service = new ElasticsearchServiceImpl("http://localhost:9200")
service.createIndex[Person]("person")
println("Index:")
println(service.getIndexSettings("person"))
}
Create Elasticsearch template
import com.datawizards.dmg.examples.TestModel.PersonWithMultipleEsAnnotations
import com.datawizards.dmg.service.ElasticsearchServiceImpl
object CreateElasticsearchTemplate extends App {
val service = new ElasticsearchServiceImpl("http://localhost:9200")
service.updateTemplate[PersonWithMultipleEsAnnotations]("people")
println("Template:")
println(service.getTemplate("people"))
}
Create Hive table
import com.datawizards.dmg.service.HiveServiceImpl
HiveServiceImpl.createHiveTable[Person]()
Extracting class metadata
To extract class metadata you can use method MetaDataWithDialectExtractor.extractClassMetaDataForDialect
. Example:
MetaDataWithDialectExtractor.extractClassMetaDataForDialect[T](Some(dialects.HiveDialect))
Customizations
Custom column name
import com.datawizards.dmg.annotations._
case class Person(
@column(name="personName")
name: String,
age: Int
)
DataModelGenerator.generate[Person](dialects.H2Dialect)
CREATE TABLE Person(
personName VARCHAR,
age INT
);
Custom column name specific for dialect
import com.datawizards.dmg.annotations._
case class Person(
@column(name="NAME")
@column(name="personName", dialects.ElasticsearchDialect)
name: String,
@column(name="AGE")
@column(name="personAge", dialects.ElasticsearchDialect)
age: Int
)
DataModelGenerator.generate[Person](dialects.H2Dialect)
DataModelGenerator.generate[Person](dialects.ElasticsearchDialect)
CREATE TABLE PEOPLE(
NAME VARCHAR,
AGE INT
);
{
"mappings" : {
"person" : {
"personName" : {"type" : "string"},
"personAge" : {"type" : "integer"}
}
}
}
Custom table name
import com.datawizards.dmg.annotations._
@table("PEOPLE")
case class Person(
name: String,
age: Int
)
DataModelGenerator.generate[Person](dialects.H2Dialect)
CREATE TABLE PEOPLE(
name VARCHAR,
age INT
);
Custom table name specific for dialect
import com.datawizards.dmg.annotations._
@table("PEOPLE")
@table("person", dialects.ElasticsearchDialect)
case class Person(
name: String,
age: Int
)
DataModelGenerator.generate[Person](dialects.H2Dialect)
DataModelGenerator.generate[Person](dialects.ElasticsearchDialect)
CREATE TABLE PEOPLE(
name VARCHAR,
age INT
);
{
"mappings" : {
"person" : {
"name" : {"type" : "string"},
"age" : {"type" : "integer"}
}
}
}
Placeholders
data-model-generator supports placeholder variables when generating data model. Placeholder variables can be used in any annotation.
Example use case for placeholder variables is to use them for generating table name dependent on environment. For example, each environment has dedicated DB schema e.g. development, uat, production.
@table("${environment}.people")
case class Person(
name: String,
age: Int
)
TemplateHandler.inflate(DataModelGenerator.generate[Person](H2Dialect), Map("environment" -> "development"))
CREATE TABLE development.people(
name VARCHAR,
age INT
);
TemplateHandler.inflate(DataModelGenerator.generate[Person](H2Dialect), Map("environment" -> "production"))
CREATE TABLE production.people(
name VARCHAR,
age INT
);
Documentation comments
@comment("People data")
case class PersonWithComments(
@comment("Person name") name: String,
age: Int
)
H2
DataModelGenerator.generate[PersonWithComments](dialects.H2Dialect)
CREATE TABLE PersonWithComments(
name VARCHAR COMMENT 'Person name',
age INT
);
COMMENT ON TABLE PersonWithComments IS 'People data';
Hive
DataModelGenerator.generate[PersonWithComments](dialects.HiveDialect)
CREATE TABLE PersonWithComments(
name STRING COMMENT 'Person name',
age INT
)
COMMENT 'People data';
Redshift
DataModelGenerator.generate[PersonWithComments](dialects.RedshiftDialect)
CREATE TABLE PersonWithComments(
name VARCHAR,
age INTEGER
);
COMMENT ON TABLE PersonWithComments IS 'People data';
COMMENT ON COLUMN PersonWithComments.name IS 'Person name';
Avro schema
DataModelGenerator.generate[PersonWithComments](dialects.AvroSchemaDialect)
{
"namespace": "com.datawizards.dmg.examples",
"type": "record",
"name": "PersonWithComments",
"doc": "People data",
"fields": [
{"name": "name", "type": "string", "doc": "Person name"},
{"name": "age", "type": "int"}
]
}
Column length
import com.datawizards.dmg.annotations._
case class Person(
@length(1000) name: String,
age: Int
)
DataModelGenerator.generate[Person](dialects.H2Dialect)
CREATE TABLE PEOPLE(
name VARCHAR(1000),
age INT
);
Not null
import com.datawizards.dmg.annotations._
case class Person(
@notNull name: String,
age: Int
)
DataModelGenerator.generate[Person](dialects.H2Dialect)
DataModelGenerator.generate[Person](dialects.RedshiftDialect)
DataModelGenerator.generate[Person](dialects.AvroSchemaDialect)
H2 - not null
CREATE TABLE PersonWithNull(
name VARCHAR NOT NULL,
age INT
);
Redshift - not null
CREATE TABLE PersonWithNull(
name VARCHAR NOT NULL,
age INTEGER
);
Avro schema - not null
{
"namespace": "com.datawizards.dmg",
"type": "record",
"name": "PersonWithNull",
"fields": [
{"name": "name", "type": "string"},
{"name": "age", "type": ["null", "int"]}
]
}
Underscore
Convert table and column names for selected dialect to underscore convention.
@underscore(dialect=dialects.H2Dialect)
case class PersonWithUnderscore(
personName: String,
personAge: Int
)
CREATE TABLE person_with_underscore(
person_name VARCHAR,
person_age INT
);
Hive customizations
Hive external table
@hiveExternalTable(location="hdfs:///data/people")
case class Person(name: String, age: Int)
DataModelGenerator.generate[Person](dialects.HiveDialect)
CREATE EXTERNAL TABLE Person(
name STRING,
age INT
)
LOCATION 'hdfs:///data/people';
Hive ROW FORMAT SERDE
@hiveRowFormatSerde(format="org.apache.hadoop.hive.serde2.avro.AvroSerDe")
case class Person(name: String, age: Int)
DataModelGenerator.generate[Person](dialects.HiveDialect)
CREATE TABLE Person(
name STRING,
age INT
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe';
Hive STORED AS
@hiveStoredAs(format="PARQUET")
case class Person(name: String, age: Int)
DataModelGenerator.generate[Person](dialects.HiveDialect)
CREATE TABLE Person(
name STRING,
age INT
)
STORED AS PARQUET;
Hive TABLE PROPERTIES
@hiveTableProperty("key1", "value1")
@hiveTableProperty("key2", "value2")
@hiveTableProperty("key3", "value3")
case class Person(name: String, age: Int)
DataModelGenerator.generate[Person](dialects.HiveDialect)
CREATE TABLE Person(
name STRING,
age INT
)
TBLPROPERTIES(
'key1' = 'value1',
'key2' = 'value2',
'key3' = 'value3'
);
Hive avro schema url property
@hiveTableProperty("avro.schema.url", "hdfs:///metadata/person.avro")
case class Person(name: String, age: Int)
DataModelGenerator.generate[Person](dialects.HiveDialect)
If "avro.schema.url" table property is provided then generated data model doesn't have any columns definitions, because they are taken by Hive from avro schema.
CREATE TABLE Person
TBLPROPERTIES(
'avro.schema.url' = 'hdfs:///metadata/person.avro'
);
Hive partition columns
case class ClicksPartitioned(
time: Timestamp,
event: String,
user: String,
@hivePartitionColumn
year: Int,
@hivePartitionColumn
month: Int,
@hivePartitionColumn
day: Int
)
DataModelGenerator.generate[ClicksPartitioned](dialects.HiveDialect)
CREATE TABLE ClicksPartitioned(
time TIMESTAMP,
event STRING,
user STRING
)
PARTITIONED BY(year INT, month INT, day INT);
Hive partition columns - order
case class ClicksPartitioned(
time: Timestamp,
event: String,
user: String,
@hivePartitionColumn(order=3)
day: Int,
@hivePartitionColumn(order=1)
year: Int,
@hivePartitionColumn(order=2)
month: Int
)
DataModelGenerator.generate[ClicksPartitionedWithOrder](dialects.HiveDialect)
CREATE TABLE ClicksPartitionedWithOrder(
time TIMESTAMP,
event STRING,
user STRING
)
PARTITIONED BY(year INT, month INT, day INT);
Hive Parquet table with many annotations
@table("CUSTOM_TABLE_NAME")
@comment("Table comment")
@hiveStoredAs(format="PARQUET")
@hiveExternalTable(location="hdfs:///data/table")
@hiveTableProperty("key1", "value1")
@hiveTableProperty("key2", "value2")
@hiveTableProperty("key3", "value3")
case class ParquetTableWithManyAnnotations(
@column("eventTime")
@comment("Event time")
time: Timestamp,
@comment("Event name")
event: String,
@comment("User id")
user: String,
@hivePartitionColumn(order=3)
day: Int,
@hivePartitionColumn(order=1)
year: Int,
@hivePartitionColumn(order=2)
month: Int
)
DataModelGenerator.generate[ParquetTableWithManyAnnotations](dialects.HiveDialect)
CREATE EXTERNAL TABLE CUSTOM_TABLE_NAME(
eventTime TIMESTAMP COMMENT 'Event time',
event STRING COMMENT 'Event name',
user STRING COMMENT 'User id'
)
COMMENT 'Table comment'
PARTITIONED BY(year INT, month INT, day INT)
STORED AS PARQUET
LOCATION 'hdfs:///data/table'
TBLPROPERTIES(
'key1' = 'value1',
'key2' = 'value2',
'key3' = 'value3'
);
Hive Avro table with many annotations
@table("CUSTOM_TABLE_NAME")
@comment("Table comment")
@hiveRowFormatSerde(format="org.apache.hadoop.hive.serde2.avro.AvroSerDe")
@hiveStoredAs("INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'")
@hiveExternalTable(location="hdfs:///data/table")
@hiveTableProperty("avro.schema.url", "hdfs:///metadata/table.avro")
@hiveTableProperty("key1", "value1")
@hiveTableProperty("key2", "value2")
@hiveTableProperty("key3", "value3")
case class AvroTableWithManyAnnotations(
@column("eventTime")
@comment("Event time")
time: Timestamp,
@comment("Event name")
event: String,
@comment("User id")
user: String,
@hivePartitionColumn(order=3)
day: Int,
@hivePartitionColumn(order=1)
year: Int,
@hivePartitionColumn(order=2)
month: Int
)
DataModelGenerator.generate[AvroTableWithManyAnnotations](dialects.HiveDialect)
CREATE EXTERNAL TABLE CUSTOM_TABLE_NAME
COMMENT 'Table comment'
PARTITIONED BY(year INT, month INT, day INT)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION 'hdfs:///data/table'
TBLPROPERTIES(
'avro.schema.url' = 'hdfs:///metadata/table.avro',
'key1' = 'value1',
'key2' = 'value2',
'key3' = 'value3'
);
Elasticsearch customizations
index settings
@esSetting("number_of_shards", 1)
@esSetting("number_of_replicas", 3)
@esSetting("blocks.read_only", true)
@esSetting("codec", "best_compression")
case class Person(name: String, age: Int)
DataModelGenerator.generate[Person](dialects.ElasticsearchDialect)
{
"settings" : {
"number_of_shards" : 1,
"number_of_replicas" : 3,
"blocks.read_only" : "true",
"codec" : "best_compression"
},
"mappings" : {
"Person" : {
"properties" : {
"name" : {"type" : "string"},
"age" : {"type" : "integer"}
}
}
}
}
index parameter
Index parameter: https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-index.html
case class Person(
@esIndex("not_analyzed") name: String,
age: Int
)
DataModelGenerator.generate[Person](dialects.ElasticsearchDialect)
{
"mappings" : {
"PersonEsIndexSettings" : {
"properties" : {
"name" : {"type" : "string", "index" : "not_analyzed"},
"age" : {"type" : "integer"}
}
}
}
}
format parameter
Date format parameter: https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-date-format.html
case class Person(
name: String,
@esFormat("yyyy-MM-dd") birthday: Date
)
DataModelGenerator.generate[Person](dialects.ElasticsearchDialect)
{
"mappings" : {
"Person" : {
"properties" : {
"name" : {"type" : "string"},
"birthday" : {"type" : "date", "format" : "yyyy-MM-dd"}
}
}
}
}
Elasticsearch template
https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-templates.html
@esTemplate("people*")
case class PersonWithEsTemplate(name: String, age: Int)
DataModelGenerator.generate[Person](dialects.ElasticsearchDialect)
{
"template" : "people*",
"mappings" : {
"PersonWithEsTemplate" : {
"properties" : {
"name" : {"type" : "string"},
"age" : {"type" : "integer"}
}
}
}
}
Elasticsearch multiple annotations
@table("people")
@esTemplate("people*")
@esSetting("number_of_shards", 1)
@esSetting("number_of_replicas", 3)
case class PersonWithMultipleEsAnnotations(
@esIndex("not_analyzed")
@column("personName")
name: String,
@column("personBirthday")
@esFormat("yyyy-MM-dd")
birthday: Date
)
DataModelGenerator.generate[PersonWithMultipleEsAnnotations](dialects.ElasticsearchDialect)
{
"template" : "people*",
"settings" : {
"number_of_shards" : 1,
"number_of_replicas" : 3
},
"mappings" : {
"people" : {
"properties" : {
"personName" : {"type" : "string", "index" : "not_analyzed"},
"personBirthday" : {"type" : "date", "format" : "yyyy-MM-dd"}
}
}
}
}