google-cloud/src/main/java/io/cdap/plugin/gcp/bigquery/sqlengine/BigQuerySparkDatasetProducer.java at dee2b2ddaded94da295cbd0fcdd826ddda9e2bc5 · data-integrations/google-cloud · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
/*
 * Copyright © 2021 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package io.cdap.plugin.gcp.bigquery.sqlengine;

import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.cdap.etl.api.engine.sql.dataset.RecordCollection;
import io.cdap.cdap.etl.api.engine.sql.dataset.SQLDataset;
import io.cdap.cdap.etl.api.engine.sql.dataset.SQLDatasetDescription;
import io.cdap.cdap.etl.api.engine.sql.dataset.SQLDatasetProducer;
import io.cdap.cdap.etl.api.sql.engine.dataset.SparkRecordCollectionImpl;
import org.apache.spark.SparkContext;
import org.apache.spark.sql.DataFrameReader;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import javax.annotation.Nullable;

/**
 * {@link SQLDatasetProducer} that reads a BigQuery table into a Spark dataframe using the
 * Spark-BigQuery connector and hands the result back as a record collection.
 */
public class BigQuerySparkDatasetProducer
  implements SQLDatasetProducer, Serializable {

  private static final Logger LOG = LoggerFactory.getLogger(BigQuerySparkDatasetProducer.class);

  // Spark data source format name and the reader option keys used for credentials.
  private static final String FORMAT = "bigquery";
  private static final String CONFIG_CREDENTIALS_FILE = "credentialsFile";
  private static final String CONFIG_CREDENTIALS = "credentials";

  private BigQuerySQLEngineConfig config;
  private String project;
  private String bqDataset;
  private String bqTable;
  private Schema schema;


  /**
   * Creates a producer bound to a single BigQuery table.
   *
   * @param config    engine configuration, consulted for service account settings
   * @param project   first segment of the table path handed to the reader
   * @param bqDataset second segment of the table path handed to the reader
   * @param bqTable   third segment of the table path handed to the reader
   * @param schema    schema whose INT and FLOAT fields drive the column casts after loading
   */
  public BigQuerySparkDatasetProducer(BigQuerySQLEngineConfig config,
                                      String project,
                                      String bqDataset,
                                      String bqTable,
                                      Schema schema) {
    this.config = config;
    this.project = project;
    this.bqDataset = bqDataset;
    this.bqTable = bqTable;
    this.schema = schema;
  }

  /**
   * This producer does not supply a description.
   *
   * @return always {@code null}
   */
  @Override
  public SQLDatasetDescription getDescription() {
    return null;
  }

  /**
   * Loads the configured table through the {@code bigquery} Spark format, casts INT and FLOAT
   * columns to the matching Spark types and wraps the dataframe in a record collection.
   *
   * @param sqlDataset dataset being produced; not read by this implementation
   * @return record collection backed by the loaded dataframe
   */
  @Override
  @Nullable
  public RecordCollection produce(SQLDataset sqlDataset) {
    // Table path in the form project.dataset.table
    String tablePath = String.format("%s.%s.%s", project, bqDataset, bqTable);

    // Obtain a Spark context and build the session on top of it.
    SparkContext sparkContext = SparkContext.getOrCreate();
    SparkSession session = SparkSession.builder()
      .appName("spark-bq-connector-reader")
      .sparkContext(sparkContext)
      .getOrCreate();

    DataFrameReader reader = session.read().format(FORMAT);
    applyCredentials(reader);

    // Read the table and align column types with the expected schema.
    Dataset<Row> loaded = reader.load(tablePath);
    return new SparkRecordCollectionImpl(convertFieldTypes(loaded));
  }

  /**
   * Sets the credential option on the reader: the service account file path when the config is
   * in file path mode and a path is present, otherwise the base64-encoded service account JSON
   * when the config is in JSON mode and the JSON is present. When neither applies, no credential
   * option is set.
   *
   * @param reader reader to configure; the value returned by {@code option} is not captured, the
   *               same reader instance is used by the caller for loading
   */
  private void applyCredentials(DataFrameReader reader) {
    boolean filePathMode = Boolean.TRUE.equals(config.isServiceAccountFilePath());
    if (filePathMode && config.getServiceAccountFilePath() != null) {
      reader.option(CONFIG_CREDENTIALS_FILE, config.getServiceAccountFilePath());
      return;
    }

    boolean jsonMode = Boolean.TRUE.equals(config.isServiceAccountJson());
    if (jsonMode && config.getServiceAccountJson() != null) {
      reader.option(CONFIG_CREDENTIALS, toBase64(config.getServiceAccountJson()));
    }
  }

  /**
   * Encodes text as base64 over its UTF-8 bytes.
   *
   * @param text text to encode
   * @return base64 representation of the UTF-8 bytes of {@code text}
   */
  private String toBase64(String text) {
    Base64.Encoder encoder = Base64.getEncoder();
    return encoder.encodeToString(text.getBytes(StandardCharsets.UTF_8));
  }

  /**
   * Casts every column whose schema field is INT or FLOAT (directly or wrapped in a nullable
   * schema) to Spark's integer or float type. Other columns are left untouched.
   *
   * @param ds input dataframe
   * @return dataframe with the casts applied
   */
  private Dataset<Row> convertFieldTypes(Dataset<Row> ds) {
    Dataset<Row> result = ds;

    for (Schema.Field schemaField : schema.getFields()) {
      String column = schemaField.getName();

      // Look through a nullable wrapper so the check runs against the underlying type.
      Schema declared = schemaField.getSchema();
      Schema effective = declared.isNullable() ? declared.getNonNullable() : declared;
      Schema.Type type = effective.getType();

      if (type == Schema.Type.INT) {
        LOG.trace("Converting field {} to Integer", column);
        result = result.withColumn(column, result.col(column).cast(DataTypes.IntegerType));
      } else if (type == Schema.Type.FLOAT) {
        LOG.trace("Converting field {} to Float", column);
        result = result.withColumn(column, result.col(column).cast(DataTypes.FloatType));
      }
    }

    return result;
  }

}