/*******************************************************************************
* Copyright 2014-2020 Intel Corporation.
*
* This software and the related documents are Intel copyrighted  materials,  and
* your use of  them is  governed by the  express license  under which  they were
* provided to you (License).  Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute,  disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents  are provided as  is,  with no express
* or implied  warranties,  other  than those  that are  expressly stated  in the
* License.
*******************************************************************************/

//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra     (dongarra@eecs.utk.edu)
// Piotr Luszczek    (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER

/*!
 @file ComputeProlongation_ref.cpp

 HPCG routine
 */

#include "ComputeProlongation_ref.hpp"
#include "UsmUtil.hpp"

/*!
  Routine to prolong the coarse grid correction onto the fine grid solution vector.

  @param[in]  Af - Fine grid sparse matrix object containing pointers to current coarse grid correction and the f2c operator.
  @param[inout] xf - Fine grid solution vector, updated with the coarse grid correction.

  Each coarse grid correction value is added to the fine grid entry that the f2c operator maps it to.
  Fine grid entries that are not referenced by the f2c operator are left unchanged.

  @return Returns zero on success and a non-zero value otherwise.
*/
int ComputeProlongation_ref(const SparseMatrix & Af, Vector & xf) {

  double * xfv = xf.values;
  double * xcv = Af.mgData_host->xc->values;
  local_int_t * f2c = Af.mgData_host->f2cOperator;
  local_int_t nc = Af.mgData_host->rc->localLength;

// TODO: Somehow note that this loop can be safely vectorized since f2c has no repeated indices
#pragma ivdep
  for (local_int_t i=0; i<nc; ++i) xfv[f2c[i]] += xcv[i]; // This loop is safe to vectorize

  return 0;
}

/*!
  Submits a SYCL kernel that adds the coarse grid correction to the fine grid solution vector.

  For every coarse index i in [0, nc) the kernel performs
    xf[perm[f2c[i]]] += xc[perm_coarse[i]]
  where f2c maps unpermuted fine rows to coarse rows, so both sides are routed
  through the permutations stored in the matrix optimization data.

  @param[in]    Af - Fine grid sparse matrix object; its mgData provides the coarse grid correction (xc),
                     the f2c operator and the coarse length, its optimizationData provides perm and perm_coarse.
  @param[inout] xf - Fine grid solution vector, updated with the coarse grid correction.
  @param[in]    main_queue - Queue the kernel is submitted to.
  @param[out]   ierr - Set to zero before the kernel is submitted. Errors raised by the submission itself
                       surface as exceptions from the queue, not through this value.
  @param[in]    deps - Events the kernel must wait for before it starts.

  @return The event associated with the submitted kernel.
*/
sycl::event ComputeProlongation(const SparseMatrix & Af, Vector & xf, sycl::queue & main_queue,
                                int& ierr, const std::vector<sycl::event> & deps) {

  // Work-group size used for the launch; the global range is padded up to a multiple of it.
  constexpr int workGroupSize = 256;

  double * xfv = xf.values;
  double * xcv = Af.mgData->xc->values;
  local_int_t * f2c = Af.mgData->f2cOperator;
  local_int_t nc = Af.mgData->rc->localLength;

  struct optData *opt = static_cast<struct optData *>(Af.optimizationData);
  local_int_t *perm_fine = opt->perm;
  local_int_t *perm_coarse = opt->perm_coarse;

  // The out-parameter was previously left untouched; report success explicitly so that
  // callers checking ierr after this call never read a stale or uninitialized value.
  ierr = 0;

  //DPCPP ComputeProlongationKernel
  return main_queue.submit([&](sycl::handler &cgh) {
    cgh.depends_on(deps);

    const local_int_t total_size = round_up_next_multiple(nc, workGroupSize);

    // Pointers and nc are captured by value so the kernel owns its own copies.
    auto kernel = [=] (sycl::nd_item<1> item) {
      local_int_t id = item.get_global_id(0);

      // Work-items in the padded tail of the range have nothing to do.
      if (id < nc) {
          // f2c maps unpermuted fine to coarse, so the permutations must be applied on both sides.
          local_int_t row_fine = perm_fine[f2c[id]];
          local_int_t row_coarse = perm_coarse[id];
          xfv[row_fine] += xcv[row_coarse];
      }
    };
    cgh.parallel_for<class ComputeProlongationClass>(sycl::nd_range<1>(total_size, workGroupSize), kernel);
  });
}
