/*
 * Decompiled with CFR 0.152.
 */
package org.apache.spark.examples.ml;

import java.util.Arrays;
import java.util.List;
import org.apache.spark.ml.feature.MinHashLSH;
import org.apache.spark.ml.feature.MinHashLSHModel;
import org.apache.spark.ml.linalg.Vector;
import org.apache.spark.ml.linalg.VectorUDT;
import org.apache.spark.ml.linalg.Vectors;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

/**
 * Example program that exercises {@link MinHashLSH} on two small in-memory datasets.
 *
 * <p>The program builds two DataFrames of sparse feature vectors, fits a MinHash LSH model on
 * the first one, and then prints three results to standard output: the transformed (hashed)
 * dataset, an approximate similarity join between the two datasets, and an approximate
 * nearest-neighbor search for a fixed key vector.
 */
public class JavaMinHashLSHExample {
    /**
     * Runs the example end to end.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("JavaMinHashLSHExample").getOrCreate();
        try {
            // Each row is (id, features); features is a sparse vector of size 6 whose listed
            // indices hold the value 1.0.
            List<Row> dataA = Arrays.asList(
                RowFactory.create(0, Vectors.sparse(6, new int[]{0, 1, 2}, new double[]{1.0, 1.0, 1.0})),
                RowFactory.create(1, Vectors.sparse(6, new int[]{2, 3, 4}, new double[]{1.0, 1.0, 1.0})),
                RowFactory.create(2, Vectors.sparse(6, new int[]{0, 2, 4}, new double[]{1.0, 1.0, 1.0})));

            List<Row> dataB = Arrays.asList(
                RowFactory.create(0, Vectors.sparse(6, new int[]{1, 3, 5}, new double[]{1.0, 1.0, 1.0})),
                RowFactory.create(1, Vectors.sparse(6, new int[]{2, 3, 5}, new double[]{1.0, 1.0, 1.0})),
                RowFactory.create(2, Vectors.sparse(6, new int[]{1, 2, 4}, new double[]{1.0, 1.0, 1.0})));

            // Both datasets share one schema: a non-nullable integer id and a non-nullable vector.
            StructType schema = new StructType(new StructField[]{
                new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
                new StructField("features", new VectorUDT(), false, Metadata.empty())
            });
            Dataset<Row> dfA = spark.createDataFrame(dataA, schema);
            Dataset<Row> dfB = spark.createDataFrame(dataB, schema);

            // Query vector used for the nearest-neighbor search below.
            int[] indices = new int[]{1, 3};
            double[] values = new double[]{1.0, 1.0};
            Vector key = Vectors.sparse(6, indices, values);

            MinHashLSH mh = new MinHashLSH()
                .setNumHashTables(5)
                .setInputCol("features")
                .setOutputCol("hashes");
            MinHashLSHModel model = mh.fit(dfA);

            System.out.println("The hashed dataset where hashed values are stored in the column 'hashes':");
            model.transform(dfA).show();

            // The join result exposes the two inputs as "datasetA" and "datasetB"; only the ids
            // and the distance column (named "JaccardDistance" here) are displayed.
            System.out.println("Approximately joining dfA and dfB on Jaccard distance smaller than 0.6:");
            model.approxSimilarityJoin(dfA, dfB, 0.6, "JaccardDistance")
                .select(
                    functions.col("datasetA.id").alias("idA"),
                    functions.col("datasetB.id").alias("idB"),
                    functions.col("JaccardDistance"))
                .show();

            System.out.println("Approximately searching dfA for 2 nearest neighbors of the key:");
            model.approxNearestNeighbors(dfA, key, 2).show();
        } finally {
            // Stop the session even when one of the steps above throws, so it is not left running.
            spark.stop();
        }
    }
}

