!--------------------------------------------------------------------------------------------------!
!   CP2K: A general program to perform molecular dynamics simulations                              !
!   Copyright 2000-2023 CP2K developers group <https://cp2k.org>                                   !
!                                                                                                  !
!   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
!--------------------------------------------------------------------------------------------------!

! **************************************************************************************************
!> \note
!>      If parallel mode is distributed certain combination of
!>      "in_use" and "in_space" can not be used.
!>      For performance reasons it would be better to have the loops
!>      over g-vectros in the gather/scatter routines in new subprograms
!>      with the actual arrays (also the addressing) in the parameter list
!> \par History
!>      JGH (29-Dec-2000) : Changes for parallel use
!>      JGH (13-Mar-2001) : added timing calls
!>      JGH (26-Feb-2003) : OpenMP enabled
!>      JGH (17-Nov-2007) : Removed mass arrays
!>      JGH (01-Dec-2007) : Removed and renamed routines
!>      JGH (04-Jul-2019) : added pw_multiply routine
!>      03.2008 [tlaino] : Splitting pw_types into pw_types and pw_methods
!> \author apsi
! **************************************************************************************************
MODULE pw_methods
   USE cp_log_handling,                 ONLY: cp_logger_get_default_io_unit,&
                                              cp_to_string
   USE fast,                            ONLY: copy_cr,&
                                              copy_rc,&
                                              zero_c
   USE fft_tools,                       ONLY: BWFFT,&
                                              FWFFT,&
                                              fft3d
   USE kahan_sum,                       ONLY: accurate_sum
   USE kinds,                           ONLY: dp
   USE machine,                         ONLY: m_memory
   USE pw_copy_all,                     ONLY: pw_copy_match
   USE pw_fpga,                         ONLY: pw_fpga_c1dr3d_3d_dp,&
                                              pw_fpga_c1dr3d_3d_sp,&
                                              pw_fpga_init_bitstream,&
                                              pw_fpga_r3dc1d_3d_dp,&
                                              pw_fpga_r3dc1d_3d_sp
   USE pw_gpu,                          ONLY: pw_gpu_c1dr3d_3d,&
                                              pw_gpu_c1dr3d_3d_ps,&
                                              pw_gpu_r3dc1d_3d,&
                                              pw_gpu_r3dc1d_3d_ps
   USE pw_grid_types,                   ONLY: HALFSPACE,&
                                              PW_MODE_DISTRIBUTED,&
                                              PW_MODE_LOCAL,&
                                              pw_grid_type
   USE pw_types,                        ONLY: COMPLEXDATA1D,&
                                              COMPLEXDATA3D,&
                                              NOSPACE,&
                                              REALDATA1D,&
                                              REALDATA3D,&
                                              REALSPACE,&
                                              RECIPROCALSPACE,&
                                              pw_type
#include "../base/base_uses.f90"

   IMPLICIT NONE

   PRIVATE

   PUBLIC :: pw_zero, pw_structure_factor, pw_smoothing
   PUBLIC :: pw_copy, pw_axpy, pw_transfer, pw_scale
   PUBLIC :: pw_gauss_damp, pw_compl_gauss_damp, pw_derive, pw_laplace, pw_dr2, pw_write, pw_multiply
   PUBLIC :: pw_gauss_damp_mix
   PUBLIC :: pw_integral_ab, pw_integral_a2b
   PUBLIC :: pw_dr2_gg, pw_integrate_function
   PUBLIC :: pw_set, pw_truncated

   CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'pw_methods'
   LOGICAL, PARAMETER, PRIVATE :: debug_this_module = .FALSE.
   INTEGER, PARAMETER, PUBLIC  ::  do_accurate_sum = 0, &
                                  do_standard_sum = 1

   INTERFACE pw_gather
      MODULE PROCEDURE pw_gather_s, pw_gather_p
   END INTERFACE

   INTERFACE pw_scatter
      MODULE PROCEDURE pw_scatter_s, pw_scatter_p
   END INTERFACE

CONTAINS

! **************************************************************************************************
!> \brief Set values of a pw type to zero
!> \param pw ...
!> \par History
!>      none
!> \author apsi
! **************************************************************************************************
   SUBROUTINE pw_zero(pw)

      TYPE(pw_type), INTENT(IN)                          :: pw

      CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_zero'

      INTEGER                                            :: handle, ns
      REAL(KIND=dp)                                      :: zr

      CALL timeset(routineN, handle)
      IF (pw%in_use == REALDATA1D) THEN
         ns = SIZE(pw%cr)
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw)
         pw%cr(:) = 0._dp
!$OMP END PARALLEL WORKSHARE
      ELSE IF (pw%in_use == COMPLEXDATA1D) THEN
         ns = SIZE(pw%cc)
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw)
         pw%cc(:) = CMPLX(0._dp, 0._dp, KIND=dp)
!$OMP END PARALLEL WORKSHARE
      ELSE IF (pw%in_use == REALDATA3D) THEN
         ns = SIZE(pw%cr3d)
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw)
         pw%cr3d(:, :, :) = 0._dp
!$OMP END PARALLEL WORKSHARE
      ELSE IF (pw%in_use == COMPLEXDATA3D) THEN
         ns = SIZE(pw%cc3d)
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw)
         pw%cc3d(:, :, :) = CMPLX(0._dp, 0._dp, KIND=dp)
!$OMP END PARALLEL WORKSHARE
      ELSE
         CPABORT("No possible data field!")
      END IF

      zr = REAL(ns, KIND=dp)*1.e-6_dp
      CALL timestop(handle)

   END SUBROUTINE pw_zero

! **************************************************************************************************
!> \brief copy a pw type variable
!> \param pw1 ...
!> \param pw2 ...
!> \par History
!>      JGH (7-Mar-2001) : check for pw_grid %id_nr, allow copy if
!>        in_use == COMPLEXDATA1D and in_space == RECIPROCALSPACE
!>      JGH (21-Feb-2003) : Code for generalized reference grids
!> \author apsi
!> \note
!>      Currently only copying of respective types allowed,
!>      in order to avoid errors
! **************************************************************************************************
   SUBROUTINE pw_copy(pw1, pw2)
      TYPE(pw_type), INTENT(IN)                          :: pw1
      TYPE(pw_type), INTENT(INOUT)                       :: pw2

      CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_copy'

      INTEGER                                            :: handle, i, j, ng, ng1, ng2, ns, out_unit
      REAL(KIND=dp)                                      :: zc

      CALL timeset(routineN, handle)
      out_unit = cp_logger_get_default_io_unit()
      IF (.NOT. ASSOCIATED(pw1%pw_grid, pw2%pw_grid)) THEN

         IF (pw1%pw_grid%spherical .AND. pw2%pw_grid%spherical) THEN

            IF (pw_compatible(pw1%pw_grid, pw2%pw_grid)) THEN

               IF (pw1%in_use == COMPLEXDATA1D .AND. &
                   pw2%in_use == COMPLEXDATA1D .AND. &
                   pw1%in_space == RECIPROCALSPACE) THEN
                  ng1 = SIZE(pw1%cc)
                  ng2 = SIZE(pw2%cc)
                  ng = MIN(ng1, ng2)
!$OMP PARALLEL DO PRIVATE(i) DEFAULT(NONE) SHARED(ng,pw1,pw2)
                  DO i = 1, ng
                     pw2%cc(i) = pw1%cc(i)
                  END DO
                  IF (ng2 > ng) THEN
!$OMP PARALLEL DO PRIVATE(i) DEFAULT(NONE) SHARED(ng,ng2,pw2)
                     DO i = ng + 1, ng2
                        pw2%cc(i) = CMPLX(0.0_dp, 0.0_dp, KIND=dp)
                     END DO
                  END IF
                  ns = 2*MAX(ng1, ng2)
               ELSE
                  CPABORT("No suitable data field")
               END IF

            ELSE
               IF (out_unit > 0) THEN
                  WRITE (out_unit, "(A,I5,T30,A,L1,T60,A,I5)") &
                     " grid 1 :", pw1%pw_grid%id_nr, &
                     " spherical :", pw1%pw_grid%spherical, &
                     " reference :", pw1%pw_grid%reference
                  WRITE (out_unit, "(A,I5,T30,A,L1,T60,A,I5)") &
                     " grid 2 :", pw2%pw_grid%id_nr, &
                     " spherical :", pw2%pw_grid%spherical, &
                     " reference :", pw2%pw_grid%reference
               END IF
               CPABORT("Incompatible grids")
            END IF

         ELSE IF (.NOT. (pw1%pw_grid%spherical .OR. &
                         pw2%pw_grid%spherical)) THEN

            ng1 = SIZE(pw1%cc)
            ng2 = SIZE(pw2%cc)
            ns = 2*MAX(ng1, ng2)

            IF (pw1%in_use == COMPLEXDATA1D .AND. &
                pw2%in_use == COMPLEXDATA1D .AND. &
                pw1%in_space == RECIPROCALSPACE) THEN

               IF ((pw1%pw_grid%id_nr == pw2%pw_grid%reference)) THEN
                  IF (ng1 >= ng2) THEN
!$OMP PARALLEL DO PRIVATE(i,j) DEFAULT(NONE) SHARED(ng2,pw1,pw2)
                     DO i = 1, ng2
                        j = pw2%pw_grid%gidx(i)
                        pw2%cc(i) = pw1%cc(j)
                     END DO
                  ELSE
                     CALL pw_zero(pw2)
!$OMP PARALLEL DO PRIVATE(i,j) DEFAULT(NONE) SHARED(ng1,pw1,pw2)
                     DO i = 1, ng1
                        j = pw2%pw_grid%gidx(i)
                        pw2%cc(j) = pw1%cc(i)
                     END DO
                  END IF
               ELSE IF ((pw2%pw_grid%id_nr == pw1%pw_grid%reference)) THEN
                  IF (ng1 >= ng2) THEN
!$OMP PARALLEL DO PRIVATE(i,j) DEFAULT(NONE) SHARED(pw1,pw2,ng2)
                     DO i = 1, ng2
                        j = pw1%pw_grid%gidx(i)
                        pw2%cc(i) = pw1%cc(j)
                     END DO
                  ELSE
                     CALL pw_zero(pw2)
!$OMP PARALLEL DO PRIVATE(i,j) DEFAULT(NONE) SHARED(pw1,pw2,ng1)
                     DO i = 1, ng1
                        j = pw1%pw_grid%gidx(i)
                        pw2%cc(j) = pw1%cc(i)
                     END DO
                  END IF
               ELSE
                  !IF (out_unit > 0) THEN
                  !   WRITE (out_unit, "(A,I5,T30,A,L1,T60,A,I5)") &
                  !      " grid 1 :", pw1%pw_grid%id_nr, &
                  !      " spherical :", pw1%pw_grid%spherical, &
                  !      " reference :", pw1%pw_grid%reference
                  !   WRITE (out_unit, "(A,I5,T30,A,L1,T60,A,I5)") &
                  !      " grid 2 :", pw2%pw_grid%id_nr, &
                  !      " spherical :", pw2%pw_grid%spherical, &
                  !      " reference :", pw2%pw_grid%reference
                  !END IF
                  CALL pw_copy_match(pw1, pw2)
               END IF

            ELSE
               CPABORT("No suitable data field")
            END IF

            pw2%in_space = RECIPROCALSPACE

         ELSE
            IF (out_unit > 0) THEN
               WRITE (out_unit, "(A,I5,T30,A,L1,T60,A,I5)") &
                  " grid 1 :", pw1%pw_grid%id_nr, &
                  " spherical :", pw1%pw_grid%spherical, &
                  " reference :", pw1%pw_grid%reference
               WRITE (out_unit, "(A,I5,T30,A,L1,T60,A,I5)") &
                  " grid 2 :", pw2%pw_grid%id_nr, &
                  " spherical :", pw2%pw_grid%spherical, &
                  " reference :", pw2%pw_grid%reference
            END IF
            CPABORT("Incompatible grids")
         END IF

      ELSE

         IF (pw1%in_use == REALDATA1D .AND. pw2%in_use == REALDATA1D) THEN
            ns = SIZE(pw1%cr)
!$OMP PARALLEL DO PRIVATE(i) DEFAULT(NONE) SHARED(ns,pw1,pw2)
            DO i = 1, ns
               pw2%cr(i) = pw1%cr(i)
            END DO
         ELSE IF (pw1%in_use == COMPLEXDATA1D .AND. &
                  pw2%in_use == COMPLEXDATA1D) THEN
            ns = SIZE(pw1%cc)
!$OMP PARALLEL DO PRIVATE(i) DEFAULT(NONE) SHARED(ns,pw2,pw1)
            DO i = 1, ns
               pw2%cc(i) = pw1%cc(i)
            END DO
         ELSE IF (pw1%in_use == REALDATA3D .AND. pw2%in_use == REALDATA3D) THEN
            ns = SIZE(pw1%cr3d)
!$OMP PARALLEL DEFAULT(NONE) SHARED(pw1, pw2)
!$OMP WORKSHARE
            pw2%cr3d(:, :, :) = pw1%cr3d(:, :, :)
!$OMP END WORKSHARE
!$OMP END PARALLEL
         ELSE IF (pw1%in_use == COMPLEXDATA3D .AND. &
                  pw2%in_use == COMPLEXDATA3D) THEN
            ns = SIZE(pw1%cc3d)
!$OMP PARALLEL DEFAULT(NONE) SHARED(pw1, pw2)
!$OMP WORKSHARE
            pw2%cc3d(:, :, :) = pw1%cc3d(:, :, :)
!$OMP END WORKSHARE
!$OMP END PARALLEL
         ELSE
            CPABORT("No suitable data field")
         END IF

      END IF

      pw2%in_space = pw1%in_space
      zc = REAL(ns, KIND=dp)*1.e-6_dp
      CALL timestop(handle)

   END SUBROUTINE pw_copy

! **************************************************************************************************
!> \brief multiplies pw coeffs with a number
!> \param pw ...
!> \param a ...
!> \par History
!>      11.2004 created [Joost VandeVondele]
! **************************************************************************************************
   SUBROUTINE pw_scale(pw, a)

      TYPE(pw_type), INTENT(IN)                          :: pw
      REAL(KIND=dp), INTENT(IN)                          :: a

      CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_scale'

      INTEGER                                            :: handle, ns
      REAL(KIND=dp)                                      :: flop

      CALL timeset(routineN, handle)

      SELECT CASE (pw%in_use)
      CASE (REALDATA1D)
         ns = SIZE(pw%cr)
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw,A)
         pw%cr(:) = a*pw%cr(:)
!$OMP END PARALLEL WORKSHARE
      CASE (COMPLEXDATA1D)
         ns = 2*SIZE(pw%cc)
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw,A)
         pw%cc(:) = a*pw%cc(:)
!$OMP END PARALLEL WORKSHARE
      CASE (REALDATA3D)
         ns = SIZE(pw%cr3d)
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw,A)
         pw%cr3d(:, :, :) = a*pw%cr3d(:, :, :)
!$OMP END PARALLEL WORKSHARE
      CASE (COMPLEXDATA3D)
         ns = 2*SIZE(pw%cc3d)
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw,A)
         pw%cc3d(:, :, :) = a*pw%cc3d(:, :, :)
!$OMP END PARALLEL WORKSHARE
      CASE DEFAULT
         CPABORT("No suitable data field")
      END SELECT

      flop = REAL(ns, KIND=dp)*1.e-6_dp
      CALL timestop(handle)

   END SUBROUTINE pw_scale

! **************************************************************************************************
!> \brief Multiply all data points with a Gaussian damping factor
!>        Needed for longrange Coulomb potential
!>        V(\vec r)=erf(omega*r)/r
!>        V(\vec g)=\frac{4*\pi}{g**2}*exp(-g**2/omega**2)
!> \param pw ...
!> \param omega ...
!> \par History
!>      Frederick Stein (12-04-2019) created
!> \author Frederick Stein (12-Apr-2019)
!> \note
!>      Performs a Gaussian damping
!>      PW has to be in RECIPROCALSPACE and data in use is COMPLEXDATA1D
! **************************************************************************************************
   SUBROUTINE pw_gauss_damp(pw, omega)

      TYPE(pw_type), INTENT(IN)                          :: pw
      REAL(KIND=dp), INTENT(IN)                          :: omega

      CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gauss_damp'

      INTEGER                                            :: cnt, handle, n_exp
      REAL(KIND=dp)                                      :: flop, omega_2

      CALL timeset(routineN, handle)
      CPASSERT(omega >= 0)

      flop = 0.0_dp
      n_exp = 0

      omega_2 = omega*omega
      omega_2 = 0.25_dp/omega_2

      IF (pw%in_space == RECIPROCALSPACE .AND. &
          pw%in_use == COMPLEXDATA1D) THEN

         cnt = SIZE(pw%cc)

!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(cnt, pw, omega_2)
         pw%cc(:) = pw%cc(:)*EXP(-pw%pw_grid%gsq(:)*omega_2)
!$OMP END PARALLEL WORKSHARE
         flop = flop + 2*cnt
         n_exp = n_exp + cnt

      ELSE

         CPABORT("No suitable data field")

      END IF

      flop = flop*1.e-6_dp
      CALL timestop(handle)

   END SUBROUTINE pw_gauss_damp

! **************************************************************************************************
!> \brief Multiply all data points with a Gaussian damping factor
!>        Needed for longrange Coulomb potential
!>        V(\vec r)=erf(omega*r)/r
!>        V(\vec g)=\frac{4*\pi}{g**2}*exp(-g**2/omega**2)
!> \param pw ...
!> \param omega ...
!> \par History
!>      Frederick Stein (12-04-2019) created
!> \author Frederick Stein (12-Apr-2019)
!> \note
!>      Performs a Gaussian damping
!>      PW has to be in RECIPROCALSPACE and data in use is COMPLEXDATA1D
! **************************************************************************************************
   SUBROUTINE pw_compl_gauss_damp(pw, omega)

      TYPE(pw_type), INTENT(IN)                          :: pw
      REAL(KIND=dp), INTENT(IN)                          :: omega

      CHARACTER(len=*), PARAMETER :: routineN = 'pw_compl_gauss_damp'

      INTEGER                                            :: cnt, handle, i, n_exp
      REAL(KIND=dp)                                      :: flop, omega_2, tmp

      CALL timeset(routineN, handle)

      flop = 0.0_dp
      n_exp = 0

      omega_2 = omega*omega
      omega_2 = 0.25_dp/omega_2

      IF (pw%in_space == RECIPROCALSPACE .AND. &
          pw%in_use == COMPLEXDATA1D) THEN

         cnt = SIZE(pw%cc)

!$OMP PARALLEL DO PRIVATE(i, tmp) DEFAULT(NONE) SHARED(cnt, pw, omega_2)
         DO i = 1, cnt
            tmp = -omega_2*pw%pw_grid%gsq(i)
            IF (ABS(tmp) > 1.0e-5) THEN
               pw%cc(i) = pw%cc(i)*(1.0_dp - EXP(tmp))
            ELSE
               pw%cc(i) = pw%cc(i)*(tmp + 0.5_dp*tmp*(tmp + (1.0_dp/3.0_dp)*tmp**2))
            END IF
         END DO
         flop = flop + 2*cnt

      ELSE

         CPABORT("No suitable data field")

      END IF

      flop = flop*1.e-6_dp
      CALL timestop(handle)

   END SUBROUTINE pw_compl_gauss_damp

! **************************************************************************************************
!> \brief Multiply all data points with a Gaussian damping factor and mixes it with the original function
!>        Needed for mixed longrange/Coulomb potential
!>        V(\vec r)=(a+b*erf(omega*r))/r
!>        V(\vec g)=\frac{4*\pi}{g**2}*(a+b*exp(-g**2/omega**2))
!> \param pw ...
!> \param omega ...
!> \param scale_coul ...
!> \param scale_long ...
!> \par History
!>      Frederick Stein (16-Dec-2021) created
!> \author Frederick Stein (16-Dec-2021)
!> \note
!>      Performs a Gaussian damping
!>      PW has to be in RECIPROCALSPACE and data in use is COMPLEXDATA1D
! **************************************************************************************************
   SUBROUTINE pw_gauss_damp_mix(pw, omega, scale_coul, scale_long)

      TYPE(pw_type), INTENT(IN)                          :: pw
      REAL(KIND=dp), INTENT(IN)                          :: omega, scale_coul, scale_long

      CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gauss_damp_mix'

      INTEGER                                            :: cnt, handle, n_exp
      REAL(KIND=dp)                                      :: flop, omega_2

      CALL timeset(routineN, handle)

      flop = 0.0_dp
      n_exp = 0

      omega_2 = omega*omega
      omega_2 = 0.25_dp/omega_2

      IF (pw%in_space == RECIPROCALSPACE .AND. &
          pw%in_use == COMPLEXDATA1D) THEN

         cnt = SIZE(pw%cc)

!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(cnt, pw, omega_2, scale_coul, scale_long)
         pw%cc(:) = pw%cc(:)*(scale_coul + scale_long*EXP(-pw%pw_grid%gsq(:)*omega_2))
!$OMP END PARALLEL WORKSHARE
         flop = flop + 4*cnt
         n_exp = n_exp + cnt

      ELSE

         CPABORT("No suitable data field")

      END IF

      flop = flop*1.e-6_dp
      CALL timestop(handle)

   END SUBROUTINE pw_gauss_damp_mix

! **************************************************************************************************
!> \brief Multiply all data points with a complementary cosine
!>        Needed for truncated Coulomb potential
!>        V(\vec r)=1/r if r<rc, 0 else
!>        V(\vec g)=\frac{4*\pi}{g**2}*(1-cos(g*rc))
!> \param pw ...
!> \param rcutoff ...
!> \par History
!>      Frederick Stein (07-06-2021) created
!> \author Frederick Stein (07-Jun-2021)
!> \note
!>      Multiplies by complementary cosine
!>      PW has to be in RECIPROCALSPACE and data in use is COMPLEXDATA1D
! **************************************************************************************************
   SUBROUTINE pw_truncated(pw, rcutoff)

      TYPE(pw_type), INTENT(IN)                          :: pw
      REAL(KIND=dp), INTENT(IN)                          :: rcutoff

      CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_truncated'

      INTEGER                                            :: cnt, handle, i, n_cos, n_sqrt
      REAL(KIND=dp)                                      :: flop, tmp

      CALL timeset(routineN, handle)
      CPASSERT(rcutoff >= 0)

      flop = 0.0_dp
      n_cos = 0
      n_sqrt = 0

      IF (pw%in_space == RECIPROCALSPACE .AND. &
          pw%in_use == COMPLEXDATA1D) THEN

         cnt = SIZE(pw%cc)

!$OMP PARALLEL DO PRIVATE(i,tmp) DEFAULT(NONE) SHARED(cnt, pw, rcutoff)
         DO i = 1, cnt
            tmp = SQRT(pw%pw_grid%gsq(i))*rcutoff
            IF (tmp >= 0.005_dp) THEN
               pw%cc(i) = pw%cc(i)*(1.0_dp - COS(tmp))
            ELSE
               pw%cc(i) = pw%cc(i)*tmp**2/2.0_dp*(1.0 - tmp**2/12.0_dp)
            END IF
         END DO
         flop = flop + 2*cnt
         n_sqrt = n_sqrt + cnt
         n_cos = n_cos + cnt

      ELSE

         CPABORT("No suitable data field")

      END IF

      flop = flop*1.e-6_dp
      CALL timestop(handle)

   END SUBROUTINE pw_truncated

! **************************************************************************************************
!> \brief Calculate the derivative of a plane wave vector
!> \param pw ...
!> \param n ...
!> \par History
!>      JGH (06-10-2002) allow only for inplace derivatives
!> \author JGH (25-Feb-2001)
!> \note
!>      Calculate the derivative dx^n(1) dy^n(2) dz^n(3) PW
!>      PW has to be in RECIPROCALSPACE and data in use is COMPLEXDATA1D
! **************************************************************************************************
   SUBROUTINE pw_derive(pw, n)

      TYPE(pw_type), INTENT(IN)                          :: pw
      INTEGER, DIMENSION(3), INTENT(IN)                  :: n

      CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_derive'

      COMPLEX(KIND=dp)                                   :: im
      INTEGER                                            :: cnt, handle, m
      REAL(KIND=dp)                                      :: flop

      CALL timeset(routineN, handle)
      CPASSERT(ALL(n >= 0))

      m = SUM(n)
      im = CMPLX(0.0_dp, 1.0_dp, KIND=dp)**m

      flop = 0.0_dp

      IF (pw%in_space == RECIPROCALSPACE .AND. &
          pw%in_use == COMPLEXDATA1D) THEN

         cnt = SIZE(pw%cc)

         IF (n(1) == 1) THEN
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(cnt, pw)
            pw%cc(:) = pw%cc(:)*pw%pw_grid%g(1, :)
!$OMP END PARALLEL WORKSHARE
            flop = flop + 6*cnt
         ELSE IF (n(1) > 1) THEN
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(cnt, pw,n)
            pw%cc(:) = pw%cc(:)*(pw%pw_grid%g(1, :)**n(1))
!$OMP END PARALLEL WORKSHARE
            flop = flop + 7*cnt
         END IF
         IF (n(2) == 1) THEN
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw, cnt)
            pw%cc(:) = pw%cc(:)*pw%pw_grid%g(2, :)
!$OMP END PARALLEL WORKSHARE
            flop = flop + 6*cnt
         ELSE IF (n(2) > 1) THEN
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw, cnt,n)
            pw%cc(:) = pw%cc(:)*(pw%pw_grid%g(2, :)**n(2))
!$OMP END PARALLEL WORKSHARE
            flop = flop + 7*cnt
         END IF
         IF (n(3) == 1) THEN
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw,cnt)
            pw%cc(:) = pw%cc(:)*pw%pw_grid%g(3, :)
!$OMP END PARALLEL WORKSHARE
            flop = flop + 6*cnt
         ELSE IF (n(3) > 1) THEN
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw,cnt,n)
            pw%cc(:) = pw%cc(:)*(pw%pw_grid%g(3, :)**n(3))
!$OMP END PARALLEL WORKSHARE
            flop = flop + 7*cnt
         END IF

         ! im can take the values 1, -1, i, -i
         ! skip this if im == 1
         IF (ABS(REAL(im, KIND=dp) - 1.0_dp) > 1.0E-10_dp) THEN
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw,cnt,im)
            pw%cc(:) = im*pw%cc(:)
!$OMP END PARALLEL WORKSHARE
            flop = flop + 6*cnt
         END IF

      ELSE

         CPABORT("No suitable data field")

      END IF

      flop = flop*1.e-6_dp
      CALL timestop(handle)

   END SUBROUTINE pw_derive

! **************************************************************************************************
!> \brief Calculate the Laplacian of a plane wave vector
!> \param pw ...
!> \par History
!>      Frederick Stein (01-02-2022) created
!> \author JGH (25-Feb-2001)
!> \note
!>      Calculate the derivative DELTA PW
!>      PW has to be in RECIPROCALSPACE and data in use is COMPLEXDATA1D
! **************************************************************************************************
   SUBROUTINE pw_laplace(pw)

      TYPE(pw_type), INTENT(IN)                          :: pw

      CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_laplace'

      INTEGER                                            :: cnt, handle
      REAL(KIND=dp)                                      :: flop

      CALL timeset(routineN, handle)

      flop = 0.0_dp

      IF (pw%in_space == RECIPROCALSPACE .AND. &
          pw%in_use == COMPLEXDATA1D) THEN

         cnt = SIZE(pw%cc)

!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(cnt, pw)
         pw%cc(:) = -pw%cc(:)*pw%pw_grid%gsq(:)
!$OMP END PARALLEL WORKSHARE
         flop = flop + 6*cnt

      ELSE

         CPABORT("No suitable data field")

      END IF

      flop = flop*1.e-6_dp
      CALL timestop(handle)

   END SUBROUTINE pw_laplace

! **************************************************************************************************
!> \brief Calculate the tensorial 2nd derivative of a plane wave vector
!> \param pw ...
!> \param pwdr2 ...
!> \param i ...
!> \param j ...
!> \par History
!>      none
!> \author JGH (05-May-2006)
!> \note
!>      PW has to be in RECIPROCALSPACE and data in use is COMPLEXDATA1D
! **************************************************************************************************
   SUBROUTINE pw_dr2(pw, pwdr2, i, j)

      TYPE(pw_type), INTENT(IN)                          :: pw, pwdr2
      INTEGER, INTENT(IN)                                :: i, j

      CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_dr2'

      INTEGER                                            :: cnt, handle, ig
      REAL(KIND=dp)                                      :: flop, gg, o3

      CALL timeset(routineN, handle)

      flop = 0.0_dp
      o3 = 1._dp/3._dp

      IF (pw%in_space == RECIPROCALSPACE .AND. &
          pw%in_use == COMPLEXDATA1D) THEN

         cnt = SIZE(pw%cc)

         IF (i == j) THEN
!$OMP PARALLEL DO DEFAULT(NONE) PRIVATE(ig,gg) SHARED(i,o3,pw,pwdr2,cnt)
            DO ig = 1, cnt
               gg = pw%pw_grid%g(i, ig)**2 - o3*pw%pw_grid%gsq(ig)
               pwdr2%cc(ig) = gg*pw%cc(ig)
            END DO
            flop = flop + 5*cnt
         ELSE
!$OMP PARALLEL DO PRIVATE (ig) DEFAULT(NONE) SHARED(pwdr2,pw,i,j,cnt)
            DO ig = 1, cnt
               pwdr2%cc(ig) = pw%cc(ig)*(pw%pw_grid%g(i, ig)*pw%pw_grid%g(j, ig))
            END DO
            flop = flop + 4*cnt
         END IF

      ELSE

         CPABORT("No suitable data field")

      END IF

      flop = flop*1.e-6_dp
      CALL timestop(handle)

   END SUBROUTINE pw_dr2

! **************************************************************************************************
!> \brief Calculate the tensorial 2nd derivative of a plane wave vector
!>      and divides by |G|^2. pwdr2_gg(G=0) is put to zero.
!> \param pw ...
!> \param pwdr2_gg ...
!> \param i ...
!> \param j ...
!> \par History
!>      none
!> \author RD (20-Nov-2006)
!> \note
!>      PW has to be in RECIPROCALSPACE and data in use is COMPLEXDATA1D
!>      Adapted from pw_dr2
! **************************************************************************************************
   SUBROUTINE pw_dr2_gg(pw, pwdr2_gg, i, j)

      TYPE(pw_type), INTENT(IN)                          :: pw, pwdr2_gg
      INTEGER, INTENT(IN)                                :: i, j

      INTEGER                                            :: cnt, handle, ig
      REAL(KIND=dp)                                      :: flop, gg, o3
      CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_dr2_gg'

      CALL timeset(routineN, handle)

      flop = 0.0_dp
      o3 = 1._dp/3._dp

      IF (pw%in_space == RECIPROCALSPACE .AND. &
          pw%in_use == COMPLEXDATA1D) THEN

         cnt = SIZE(pw%cc)

         IF (i == j) THEN
!$OMP PARALLEL DO PRIVATE (ig) DEFAULT(NONE) PRIVATE(gg) SHARED(cnt,pw,o3,pwdr2_gg,i)
            DO ig = pw%pw_grid%first_gne0, cnt
               gg = pw%pw_grid%g(i, ig)**2 - o3*pw%pw_grid%gsq(ig)
               pwdr2_gg%cc(ig) = gg*pw%cc(ig)/pw%pw_grid%gsq(ig)
            END DO
            flop = flop + 6*cnt
         ELSE
!$OMP PARALLEL DO PRIVATE (ig) DEFAULT(NONE) SHARED(pwdr2_gg,pw,i,j,cnt)
            DO ig = pw%pw_grid%first_gne0, cnt
               pwdr2_gg%cc(ig) = pw%cc(ig)*(pw%pw_grid%g(i, ig)*pw%pw_grid%g(j, ig)) &
                                 /pw%pw_grid%gsq(ig)
            END DO
            flop = flop + 5*cnt
         END IF

         IF (pw%pw_grid%have_g0) pwdr2_gg%cc(1) = 0.0_dp

      ELSE

         CPABORT("No suitable data field")

      END IF

      flop = flop*1.e-6_dp
      CALL timestop(handle)

   END SUBROUTINE pw_dr2_gg

! **************************************************************************************************
!> \brief Multiplies a G-space function with a smoothing factor of the form
!>      f(|G|) = exp((ecut - G^2)/sigma)/(1+exp((ecut - G^2)/sigma))
!> \param pw ...
!> \param ecut ...
!> \param sigma ...
!> \par History
!>      none
!> \author JGH (09-June-2006)
!> \note
!>      PW has to be in RECIPROCALSPACE and data in use is COMPLEXDATA1D
! **************************************************************************************************
   SUBROUTINE pw_smoothing(pw, ecut, sigma)

      TYPE(pw_type), INTENT(IN)                          :: pw
      REAL(KIND=dp), INTENT(IN)                          :: ecut, sigma

      CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_smoothing'

      INTEGER                                            :: cnt, handle, ig
      REAL(KIND=dp)                                      :: arg, f, flop

      CALL timeset(routineN, handle)

      flop = 0.0_dp

      IF (pw%in_space == RECIPROCALSPACE .AND. &
          pw%in_use == COMPLEXDATA1D) THEN

         cnt = SIZE(pw%cc)

!$OMP PARALLEL DO DEFAULT(NONE) PRIVATE(ig,arg,f) SHARED(sigma,ecut,pw,cnt)
         DO ig = 1, cnt
            arg = (ecut - pw%pw_grid%gsq(ig))/sigma
            f = EXP(arg)/(1 + EXP(arg))
            pw%cc(ig) = f*pw%cc(ig)
         END DO
         flop = flop + 6*cnt

      ELSE

         CPABORT("No suitable data field")

      END IF

      flop = flop*1.e-6_dp
      CALL timestop(handle)

   END SUBROUTINE pw_smoothing

! **************************************************************************************************
!> \brief Generalize copy of pw types
!> \param pw1 ...
!> \param pw2 ...
!> \param debug ...
!> \par History
!>      JGH (13-Mar-2001) : added gather/scatter cases
!> \author JGH (25-Feb-2001)
!> \note
!>      Copy routine that allows for in_space changes
! **************************************************************************************************
   SUBROUTINE pw_transfer(pw1, pw2, debug)

      TYPE(pw_type), INTENT(IN)                          :: pw1
      TYPE(pw_type), INTENT(INOUT)                       :: pw2
      LOGICAL, INTENT(IN), OPTIONAL                      :: debug

      CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_transfer'

      INTEGER                                            :: handle

      CALL timeset(routineN, handle)
      !sample peak memory
      CALL m_memory()

      IF (pw1%in_space == REALSPACE .AND. pw2%in_space == REALSPACE) THEN

         ! simple copy should do
         CALL pw_copy(pw1, pw2)

      ELSEIF (pw1%in_space == RECIPROCALSPACE .AND. &
              pw2%in_space == RECIPROCALSPACE) THEN

         IF (pw1%in_use == pw2%in_use) THEN

            ! simple copy should do
            CALL pw_copy(pw1, pw2)

         ELSE

            ! we have to gather/scatter the data
            IF (pw1%in_use == COMPLEXDATA1D) THEN
               CALL pw_scatter(pw1, pw2%cc3d)
            ELSEIF (pw2%in_use == COMPLEXDATA1D) THEN
               CALL pw_gather(pw2, pw1%cc3d)
            ELSE
               CPABORT("Do not know what to do")
            END IF

         END IF

      ELSE

         ! FFT needed, all further tests done in fft_wrap_pw1pw2
         CALL fft_wrap_pw1pw2(pw1, pw2, debug)

      END IF

      CALL timestop(handle)

   END SUBROUTINE pw_transfer

! **************************************************************************************************
!> \brief pw2 = alpha*pw1 + pw2
!>      alpha defaults to 1
!> \param pw1 ...
!> \param pw2 ...
!> \param alpha ...
!> \par History
!>      JGH (21-Feb-2003) : added reference grid functionality
!>      JGH (01-Dec-2007) : rename and remove complex alpha
!> \author apsi
!> \note
!>      Currently only summing up of respective types allowed,
!>      in order to avoid errors
! **************************************************************************************************
   SUBROUTINE pw_axpy(pw1, pw2, alpha)

      TYPE(pw_type), INTENT(IN)                          :: pw1, pw2
      REAL(KIND=dp), INTENT(in), OPTIONAL                :: alpha

      CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_axpy'

      INTEGER                                            :: handle, i, j, ng, ng1, ng2, out_unit
      REAL(KIND=dp)                                      :: flop, my_alpha

      CALL timeset(routineN, handle)
      out_unit = cp_logger_get_default_io_unit()

      my_alpha = 1.0_dp
      IF (PRESENT(alpha)) my_alpha = alpha

      IF (ASSOCIATED(pw1%pw_grid, pw2%pw_grid)) THEN

         IF (pw1%in_use == REALDATA1D .AND. pw2%in_use == REALDATA1D) THEN
            IF (my_alpha == 1.0_dp) THEN
!$OMP PARALLEL DO PRIVATE(i) DEFAULT(NONE) SHARED(pw2,pw1)
               DO i = 1, SIZE(pw2%cr)
                  pw2%cr(i) = pw2%cr(i) + pw1%cr(i)
               END DO
               flop = REAL(SIZE(pw2%cr), KIND=dp)
            ELSE
!$OMP PARALLEL DO PRIVATE(i) DEFAULT(NONE) SHARED(pw2,pw1,my_alpha)
               DO i = 1, SIZE(pw2%cr)
                  pw2%cr(i) = pw2%cr(i) + my_alpha*pw1%cr(i)
               END DO
               flop = REAL(2*SIZE(pw2%cr), KIND=dp)
            END IF
         ELSE IF (pw1%in_use == COMPLEXDATA1D .AND. &
                  pw2%in_use == COMPLEXDATA1D) THEN
            IF (my_alpha == 1.0_dp) THEN
!$OMP PARALLEL DO PRIVATE(i) DEFAULT(NONE) SHARED(pw2,pw1)
               DO i = 1, SIZE(pw2%cc)
                  pw2%cc(i) = pw2%cc(i) + pw1%cc(i)
               END DO
               flop = REAL(2*SIZE(pw2%cc), KIND=dp)
            ELSE
!$OMP PARALLEL DO PRIVATE(i) DEFAULT(NONE) SHARED(pw2,my_alpha,pw1)
               DO i = 1, SIZE(pw2%cc)
                  pw2%cc(i) = pw2%cc(i) + my_alpha*pw1%cc(i)
               END DO
               flop = REAL(4*SIZE(pw2%cc), KIND=dp)
            END IF
         ELSE IF (pw1%in_use == REALDATA3D .AND. pw2%in_use == REALDATA3D) THEN
            IF (my_alpha == 1.0_dp) THEN
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw1, pw2)
               pw2%cr3d = pw2%cr3d + pw1%cr3d
!$OMP END PARALLEL WORKSHARE
               flop = REAL(SIZE(pw2%cr3d), KIND=dp)
            ELSE
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw1, pw2, my_alpha)
               pw2%cr3d = pw2%cr3d + my_alpha*pw1%cr3d
!$OMP END PARALLEL WORKSHARE
               flop = REAL(2*SIZE(pw2%cr3d), KIND=dp)
            END IF
         ELSE IF (pw1%in_use == COMPLEXDATA3D .AND. &
                  pw2%in_use == COMPLEXDATA3D) THEN
            IF (my_alpha == 1.0_dp) THEN
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw1, pw2)
               pw2%cc3d = pw2%cc3d + pw1%cc3d
!$OMP END PARALLEL WORKSHARE
               flop = REAL(2*SIZE(pw2%cc3d), KIND=dp)
            ELSE
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw1, pw2, my_alpha)
               pw2%cc3d = pw2%cc3d + my_alpha*pw1%cc3d
!$OMP END PARALLEL WORKSHARE
               flop = REAL(4*SIZE(pw2%cc3d), KIND=dp)
            END IF
         ELSE
            CPABORT("No suitable data field")
         END IF

      ELSE IF (pw_compatible(pw1%pw_grid, pw2%pw_grid)) THEN

         IF (pw1%in_use == COMPLEXDATA1D .AND. &
             pw2%in_use == COMPLEXDATA1D .AND. &
             pw1%in_space == RECIPROCALSPACE .AND. &
             pw2%in_space == RECIPROCALSPACE) THEN

            ng1 = SIZE(pw1%cc)
            ng2 = SIZE(pw2%cc)
            ng = MIN(ng1, ng2)
            flop = REAL(2*ng, KIND=dp)

            IF (pw1%pw_grid%spherical) THEN

               IF (my_alpha == 1.0_dp) THEN
!$OMP PARALLEL DO PRIVATE(i) DEFAULT(NONE) SHARED(pw2,pw1,ng)
                  DO i = 1, ng
                     pw2%cc(i) = pw2%cc(i) + pw1%cc(i)
                  END DO
               ELSE
!$OMP PARALLEL DO PRIVATE(i) DEFAULT(NONE) SHARED(pw2,pw1,my_alpha,ng)
                  DO i = 1, ng
                     pw2%cc(i) = pw2%cc(i) + my_alpha*pw1%cc(i)
                  END DO
               END IF

            ELSEIF ((pw1%pw_grid%id_nr == pw2%pw_grid%reference)) THEN

               IF (my_alpha == 1.0_dp) THEN
                  IF (ng1 >= ng2) THEN
!$OMP PARALLEL DO PRIVATE(i,j) DEFAULT(NONE) SHARED(pw2,pw1,ng2)
                     DO i = 1, ng2
                        j = pw2%pw_grid%gidx(i)
                        pw2%cc(i) = pw2%cc(i) + pw1%cc(j)
                     END DO
                  ELSE
!$OMP PARALLEL DO PRIVATE(i,j) DEFAULT(NONE) SHARED(pw2,pw1,ng1)
                     DO i = 1, ng1
                        j = pw2%pw_grid%gidx(i)
                        pw2%cc(j) = pw2%cc(j) + pw1%cc(i)
                     END DO
                  END IF
               ELSE
                  IF (ng1 >= ng2) THEN
!$OMP PARALLEL DO PRIVATE(i,j) DEFAULT(NONE) SHARED(pw2,pw1,my_alpha,ng2)
                     DO i = 1, ng2
                        j = pw2%pw_grid%gidx(i)
                        pw2%cc(i) = pw2%cc(i) + my_alpha*pw1%cc(j)
                     END DO
                  ELSE
!$OMP PARALLEL DO PRIVATE(i,j) DEFAULT(NONE) SHARED(pw2,pw1,my_alpha,ng1)
                     DO i = 1, ng1
                        j = pw2%pw_grid%gidx(i)
                        pw2%cc(j) = pw2%cc(j) + my_alpha*pw1%cc(i)
                     END DO
                  END IF
               END IF
            ELSEIF ((pw2%pw_grid%id_nr == pw1%pw_grid%reference)) THEN

               IF (my_alpha == 1.0_dp) THEN
                  IF (ng1 >= ng2) THEN
!$OMP PARALLEL DO PRIVATE(i,j) DEFAULT(NONE) SHARED(pw2,pw1,ng2)
                     DO i = 1, ng2
                        j = pw1%pw_grid%gidx(i)
                        pw2%cc(i) = pw2%cc(i) + pw1%cc(j)
                     END DO
                  ELSE
!$OMP PARALLEL DO PRIVATE(i,j) DEFAULT(NONE) SHARED(pw2,pw1,ng1)
                     DO i = 1, ng1
                        j = pw1%pw_grid%gidx(i)
                        pw2%cc(j) = pw2%cc(j) + pw1%cc(i)
                     END DO
                  END IF
               ELSE
                  IF (ng1 >= ng2) THEN
!$OMP PARALLEL DO PRIVATE(i,j) DEFAULT(NONE) SHARED(pw2,pw1,my_alpha,ng2)
                     DO i = 1, ng2
                        j = pw1%pw_grid%gidx(i)
                        pw2%cc(i) = pw2%cc(i) + my_alpha*pw1%cc(j)
                     END DO
                  ELSE
!$OMP PARALLEL DO PRIVATE(i,j) DEFAULT(NONE) SHARED(pw2,pw1,my_alpha,ng1)
                     DO i = 1, ng1
                        j = pw1%pw_grid%gidx(i)
                        pw2%cc(j) = pw2%cc(j) + my_alpha*pw1%cc(i)
                     END DO
                  END IF
               END IF
            ELSE
               IF (out_unit > 0) THEN
                  WRITE (out_unit, "(A,I5,T30,A,L1,T60,A,I5)") &
                     " grid 1 :", pw1%pw_grid%id_nr, &
                     " sperical :", pw1%pw_grid%spherical, &
                     " reference :", pw1%pw_grid%reference
                  WRITE (out_unit, "(A,I5,T30,A,L1,T60,A,I5)") &
                     " grid 2 :", pw2%pw_grid%id_nr, &
                     " sperical :", pw2%pw_grid%spherical, &
                     " reference :", pw2%pw_grid%reference
               END IF
               CPABORT("Grids not compatible")

            END IF

         ELSE
            CPABORT("No suitable data field")
         END IF

      ELSE
         IF (out_unit > 0) THEN
            WRITE (out_unit, "(A,I5,T30,A,L1,T60,A,I5)") &
               " grid 1 :", pw1%pw_grid%id_nr, &
               " sperical :", pw1%pw_grid%spherical, &
               " reference :", pw1%pw_grid%reference
            WRITE (out_unit, "(A,I5,T30,A,L1,T60,A,I5)") &
               " grid 2 :", pw2%pw_grid%id_nr, &
               " sperical :", pw2%pw_grid%spherical, &
               " reference :", pw2%pw_grid%reference
         END IF
         CPABORT("Grids not compatible")

      END IF

      flop = flop*1.e-6_dp
      CALL timestop(handle)

   END SUBROUTINE pw_axpy

! **************************************************************************************************
!> \brief pw_out = pw_out + alpha * pw1 * pw2
!>      alpha defaults to 1
!> \param pw_out ...
!> \param pw1 ...
!> \param pw2 ...
!> \param alpha ...
!> \author JGH
! **************************************************************************************************
   SUBROUTINE pw_multiply(pw_out, pw1, pw2, alpha)

      TYPE(pw_type), INTENT(IN)                          :: pw_out, pw1, pw2
      REAL(KIND=dp), INTENT(in), OPTIONAL                :: alpha

      CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_multiply'

      INTEGER                                            :: handle
      REAL(KIND=dp)                                      :: flop, my_alpha

      CALL timeset(routineN, handle)

      my_alpha = 1.0_dp
      IF (PRESENT(alpha)) my_alpha = alpha

      IF (ASSOCIATED(pw_out%pw_grid, pw2%pw_grid) .AND. &
          ASSOCIATED(pw_out%pw_grid, pw2%pw_grid)) THEN

         IF (pw1%in_use == REALDATA1D .AND. pw2%in_use == REALDATA1D .AND. &
             pw_out%in_use == REALDATA1D) THEN
            IF (my_alpha == 1.0_dp) THEN
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw_out, pw1, pw2)
               pw_out%cr = pw_out%cr + pw1%cr*pw2%cr
!$OMP END PARALLEL WORKSHARE
               flop = REAL(2*SIZE(pw2%cr), KIND=dp)
            ELSE
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw_out, pw1, pw2, my_alpha)
               pw_out%cr = pw_out%cr + my_alpha*pw1%cr*pw2%cr
!$OMP END PARALLEL WORKSHARE
               flop = REAL(3*SIZE(pw2%cr), KIND=dp)
            END IF
         ELSE IF (pw1%in_use == COMPLEXDATA1D .AND. &
                  pw2%in_use == COMPLEXDATA1D .AND. &
                  pw_out%in_use == COMPLEXDATA1D) THEN
            IF (my_alpha == 1.0_dp) THEN
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw_out, pw1, pw2)
               pw_out%cc = pw_out%cc + pw1%cc*pw2%cc
!$OMP END PARALLEL WORKSHARE
               flop = REAL(3*SIZE(pw2%cc), KIND=dp)
            ELSE
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw_out, pw1, pw2, my_alpha)
               pw_out%cc = pw_out%cc + my_alpha*pw1%cc*pw2%cc
!$OMP END PARALLEL WORKSHARE
               flop = REAL(6*SIZE(pw2%cc), KIND=dp)
            END IF
         ELSE IF (pw1%in_use == REALDATA3D .AND. pw2%in_use == REALDATA3D .AND. &
                  pw_out%in_use == REALDATA3D) THEN
            IF (my_alpha == 1.0_dp) THEN
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw_out, pw1, pw2)
               pw_out%cr3d = pw_out%cr3d + pw1%cr3d*pw2%cr3d
!$OMP END PARALLEL WORKSHARE
               flop = REAL(2*SIZE(pw2%cr3d), KIND=dp)
            ELSE
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw_out, pw1, pw2, my_alpha)
               pw_out%cr3d = pw_out%cr3d + my_alpha*pw1%cr3d*pw2%cr3d
!$OMP END PARALLEL WORKSHARE
               flop = REAL(3*SIZE(pw2%cr3d), KIND=dp)
            END IF
         ELSE IF (pw1%in_use == COMPLEXDATA3D .AND. &
                  pw2%in_use == COMPLEXDATA3D .AND. &
                  pw_out%in_use == COMPLEXDATA3D) THEN
            IF (my_alpha == 1.0_dp) THEN
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw_out, pw1, pw2)
               pw_out%cc3d = pw_out%cc3d + pw1%cc3d*pw2%cc3d
!$OMP END PARALLEL WORKSHARE
               flop = REAL(3*SIZE(pw2%cc3d), KIND=dp)
            ELSE
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw_out, pw1, pw2, my_alpha)
               pw_out%cc3d = pw_out%cc3d + my_alpha*pw1%cc3d*pw2%cc3d
!$OMP END PARALLEL WORKSHARE
               flop = REAL(6*SIZE(pw2%cc3d), KIND=dp)
            END IF
         ELSE
            CPABORT("No suitable data field")
         END IF

      ELSE
         CPABORT("Grids not compatible")
      END IF

      flop = flop*1.e-6_dp
      CALL timestop(handle)

   END SUBROUTINE pw_multiply

! **************************************************************************************************
!> \brief Gathers the pw vector from a 3d data field
!> \param pw ...
!> \param c ...
!> \param scale ...
!> \par History
!>      none
!> \author JGH
! **************************************************************************************************
   SUBROUTINE pw_gather_s(pw, c, scale)

      TYPE(pw_type), INTENT(INOUT)                       :: pw
      COMPLEX(KIND=dp), DIMENSION(:, :, :), INTENT(IN)   :: c
      REAL(KIND=dp), INTENT(IN), OPTIONAL                :: scale

      CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gather_s'

      INTEGER                                            :: gpt, handle, l, m, n, ngpts
      INTEGER, DIMENSION(:), POINTER                     :: mapl, mapm, mapn
      INTEGER, DIMENSION(:, :), POINTER                  :: ghat
      REAL(KIND=dp)                                      :: cpy

      CALL timeset(routineN, handle)

      IF (pw%in_use /= COMPLEXDATA1D) THEN
         CPABORT("Data field has to be COMPLEXDATA1D")
      END IF

      ! after the gather we are in g-space
      pw%in_space = RECIPROCALSPACE

      mapl => pw%pw_grid%mapl%pos
      mapm => pw%pw_grid%mapm%pos
      mapn => pw%pw_grid%mapn%pos

      ngpts = SIZE(pw%pw_grid%gsq)

      ghat => pw%pw_grid%g_hat

      IF (PRESENT(scale)) THEN

!$OMP PARALLEL DO PRIVATE(gpt,l,m,n) DEFAULT(NONE) SHARED(ngpts, mapl,mapm,mapn,ghat,scale,pw,c)
         DO gpt = 1, ngpts

            l = mapl(ghat(1, gpt)) + 1
            m = mapm(ghat(2, gpt)) + 1
            n = mapn(ghat(3, gpt)) + 1
            pw%cc(gpt) = scale*c(l, m, n)

         END DO

      ELSE

!$OMP PARALLEL DO PRIVATE(gpt,l,m,n) DEFAULT(NONE) SHARED(ngpts, mapl,mapm,mapn,ghat,pw,c)
         DO gpt = 1, ngpts

            l = mapl(ghat(1, gpt)) + 1
            m = mapm(ghat(2, gpt)) + 1
            n = mapn(ghat(3, gpt)) + 1
            pw%cc(gpt) = c(l, m, n)

         END DO

      END IF

      cpy = REAL(ngpts, KIND=dp)*1.e-6_dp
      CALL timestop(handle)

   END SUBROUTINE pw_gather_s

! **************************************************************************************************
!> \brief ...
!> \param pw ...
!> \param c ...
!> \param scale ...
! **************************************************************************************************
   SUBROUTINE pw_gather_p(pw, c, scale)

      TYPE(pw_type), INTENT(INOUT)                       :: pw
      COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(IN)      :: c
      REAL(KIND=dp), INTENT(IN), OPTIONAL                :: scale

      CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gather_p'

      INTEGER                                            :: gpt, handle, l, m, mn, n, ngpts
      INTEGER, DIMENSION(:), POINTER                     :: mapl, mapm, mapn
      INTEGER, DIMENSION(:, :), POINTER                  :: ghat, yzq
      REAL(KIND=dp)                                      :: cpy

      CALL timeset(routineN, handle)

      IF (pw%in_use /= COMPLEXDATA1D) THEN
         CPABORT("Data field has to be COMPLEXDATA1D")
      END IF

      IF (pw%pw_grid%para%mode /= PW_MODE_DISTRIBUTED) THEN
         CPABORT("This grid type is not distributed")
      END IF

      ! after the gather we are in g-space
      pw%in_space = RECIPROCALSPACE

      mapl => pw%pw_grid%mapl%pos
      mapm => pw%pw_grid%mapm%pos
      mapn => pw%pw_grid%mapn%pos

      ngpts = SIZE(pw%pw_grid%gsq)

      ghat => pw%pw_grid%g_hat
      yzq => pw%pw_grid%para%yzq

      IF (PRESENT(scale)) THEN
!$OMP PARALLEL DO DEFAULT(NONE), &
!$OMP             PRIVATE(l,m,n,mn), &
!$OMP             SHARED(ngpts,mapl,mapm,mapn,ghat,yzq,pw,c,scale)
         DO gpt = 1, ngpts

            l = mapl(ghat(1, gpt)) + 1
            m = mapm(ghat(2, gpt)) + 1
            n = mapn(ghat(3, gpt)) + 1
            mn = yzq(m, n)
            pw%cc(gpt) = scale*c(l, mn)

         END DO
!$OMP END PARALLEL DO
      ELSE
!$OMP PARALLEL DO DEFAULT(NONE), &
!$OMP             PRIVATE(l,m,n,mn), &
!$OMP             SHARED(ngpts,mapl,mapm,mapn,ghat,yzq,pw,c)
         DO gpt = 1, ngpts

            l = mapl(ghat(1, gpt)) + 1
            m = mapm(ghat(2, gpt)) + 1
            n = mapn(ghat(3, gpt)) + 1
            mn = yzq(m, n)
            pw%cc(gpt) = c(l, mn)

         END DO
!$OMP END PARALLEL DO
      END IF

      cpy = REAL(ngpts, KIND=dp)*1.e-6_dp
      CALL timestop(handle)

   END SUBROUTINE pw_gather_p

! **************************************************************************************************
!> \brief Scatters a pw vector to a 3d data field
!> \param pw ...
!> \param c ...
!> \param scale ...
!> \par History
!>      none
!> \author JGH
! **************************************************************************************************
   SUBROUTINE pw_scatter_s(pw, c, scale)

      TYPE(pw_type), INTENT(IN)                          :: pw
      COMPLEX(KIND=dp), DIMENSION(:, :, :), &
         INTENT(INOUT)                                   :: c
      REAL(KIND=dp), INTENT(IN), OPTIONAL                :: scale

      CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_scatter_s'

      INTEGER                                            :: gpt, handle, l, m, n, ngpts
      INTEGER, DIMENSION(:), POINTER                     :: mapl, mapm, mapn
      INTEGER, DIMENSION(:, :), POINTER                  :: ghat
      REAL(KIND=dp)                                      :: cpy

      CALL timeset(routineN, handle)

      IF (pw%in_use /= COMPLEXDATA1D) THEN
         CPABORT("Data field has to be COMPLEXDATA1D")
      END IF

      IF (pw%in_space /= RECIPROCALSPACE) THEN
         CPABORT("Data has to be in RECIPROCALSPACE")
      END IF

      mapl => pw%pw_grid%mapl%pos
      mapm => pw%pw_grid%mapm%pos
      mapn => pw%pw_grid%mapn%pos

      ghat => pw%pw_grid%g_hat

      ngpts = SIZE(pw%pw_grid%gsq)

      ! should only zero the unused bits (but the zero is needed)
      IF (.NOT. PRESENT(scale)) c = 0.0_dp

      IF (PRESENT(scale)) THEN

!$OMP PARALLEL DO PRIVATE(gpt,l,m,n) DEFAULT(NONE) SHARED(ngpts,mapl,mapm,mapn,ghat,scale,pw,c)
         DO gpt = 1, ngpts

            l = mapl(ghat(1, gpt)) + 1
            m = mapm(ghat(2, gpt)) + 1
            n = mapn(ghat(3, gpt)) + 1
            c(l, m, n) = scale*pw%cc(gpt)

         END DO

      ELSE

!$OMP PARALLEL DO PRIVATE(gpt,l,m,n) DEFAULT(NONE) SHARED(ngpts,mapl,mapm,mapn,ghat,pw,c)
         DO gpt = 1, ngpts

            l = mapl(ghat(1, gpt)) + 1
            m = mapm(ghat(2, gpt)) + 1
            n = mapn(ghat(3, gpt)) + 1
            c(l, m, n) = pw%cc(gpt)

         END DO

      END IF

      IF (pw%pw_grid%grid_span == HALFSPACE) THEN

         mapl => pw%pw_grid%mapl%neg
         mapm => pw%pw_grid%mapm%neg
         mapn => pw%pw_grid%mapn%neg

         IF (PRESENT(scale)) THEN

!$OMP PARALLEL DO PRIVATE(gpt,l,m,n) DEFAULT(NONE) SHARED(ngpts, mapl,mapm,mapn,ghat,scale,pw,c)
            DO gpt = 1, ngpts

               l = mapl(ghat(1, gpt)) + 1
               m = mapm(ghat(2, gpt)) + 1
               n = mapn(ghat(3, gpt)) + 1
               c(l, m, n) = scale*CONJG(pw%cc(gpt))

            END DO

         ELSE

!$OMP PARALLEL DO PRIVATE(gpt,l,m,n) DEFAULT(NONE) SHARED(ngpts, mapl,mapm,mapn,ghat,pw,c)
            DO gpt = 1, ngpts

               l = mapl(ghat(1, gpt)) + 1
               m = mapm(ghat(2, gpt)) + 1
               n = mapn(ghat(3, gpt)) + 1
               c(l, m, n) = CONJG(pw%cc(gpt))

            END DO

         END IF

      END IF

      cpy = REAL(ngpts, KIND=dp)*1.e-6_dp
      CALL timestop(handle)

   END SUBROUTINE pw_scatter_s

! **************************************************************************************************
!> \brief ...
!> \param pw ...
!> \param c ...
!> \param scale ...
! **************************************************************************************************
   SUBROUTINE pw_scatter_p(pw, c, scale)

      TYPE(pw_type), INTENT(IN)                          :: pw
      COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(INOUT)   :: c
      REAL(KIND=dp), INTENT(IN), OPTIONAL                :: scale

      CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_scatter_p'

      INTEGER                                            :: gpt, handle, l, m, mn, n, ngpts
      INTEGER, DIMENSION(:), POINTER                     :: mapl, mapm, mapn
      INTEGER, DIMENSION(:, :), POINTER                  :: ghat, yzq
      REAL(KIND=dp)                                      :: cpy

      CALL timeset(routineN, handle)

      IF (pw%in_use /= COMPLEXDATA1D) THEN
         CPABORT("Data field has to be COMPLEXDATA1D")
      END IF

      IF (pw%in_space /= RECIPROCALSPACE) THEN
         CPABORT("Data has to be in RECIPROCALSPACE")
      END IF

      IF (pw%pw_grid%para%mode /= PW_MODE_DISTRIBUTED) THEN
         CPABORT("This grid type is not distributed")
      END IF

      mapl => pw%pw_grid%mapl%pos
      mapm => pw%pw_grid%mapm%pos
      mapn => pw%pw_grid%mapn%pos

      ghat => pw%pw_grid%g_hat
      yzq => pw%pw_grid%para%yzq

      ngpts = SIZE(pw%pw_grid%gsq)

      IF (.NOT. PRESENT(scale)) CALL zero_c(c)

      IF (PRESENT(scale)) THEN
!$OMP PARALLEL DO DEFAULT(NONE), &
!$OMP             PRIVATE(l,m,n,mn), &
!$OMP             SHARED(ngpts,mapl,mapm,mapn,ghat,yzq,pw,c,scale)
         DO gpt = 1, ngpts

            l = mapl(ghat(1, gpt)) + 1
            m = mapm(ghat(2, gpt)) + 1
            n = mapn(ghat(3, gpt)) + 1
            mn = yzq(m, n)
            c(l, mn) = scale*pw%cc(gpt)

         END DO
!$OMP END PARALLEL DO
      ELSE
!$OMP PARALLEL DO DEFAULT(NONE), &
!$OMP             PRIVATE(l,m,n,mn), &
!$OMP             SHARED(ngpts,mapl,mapm,mapn,ghat,yzq,pw,c)
         DO gpt = 1, ngpts

            l = mapl(ghat(1, gpt)) + 1
            m = mapm(ghat(2, gpt)) + 1
            n = mapn(ghat(3, gpt)) + 1
            mn = yzq(m, n)
            c(l, mn) = pw%cc(gpt)

         END DO
!$OMP END PARALLEL DO
      END IF

      IF (pw%pw_grid%grid_span == HALFSPACE) THEN

         mapm => pw%pw_grid%mapm%neg
         mapn => pw%pw_grid%mapn%neg
         mapl => pw%pw_grid%mapl%neg

         IF (PRESENT(scale)) THEN
!$OMP PARALLEL DO DEFAULT(NONE), &
!$OMP             PRIVATE(l,m,n,mn), &
!$OMP             SHARED(ngpts,mapl,mapm,mapn,ghat,yzq,pw,c,scale)
            DO gpt = 1, ngpts

               l = mapl(ghat(1, gpt)) + 1
               m = mapm(ghat(2, gpt)) + 1
               n = mapn(ghat(3, gpt)) + 1
               mn = yzq(m, n)
               c(l, mn) = scale*CONJG(pw%cc(gpt))

            END DO
!$OMP END PARALLEL DO
         ELSE
!$OMP PARALLEL DO DEFAULT(NONE), &
!$OMP             PRIVATE(l,m,n,mn), &
!$OMP             SHARED(ngpts,mapl,mapm,mapn,ghat,yzq,pw,c)
            DO gpt = 1, ngpts

               l = mapl(ghat(1, gpt)) + 1
               m = mapm(ghat(2, gpt)) + 1
               n = mapn(ghat(3, gpt)) + 1
               mn = yzq(m, n)
               c(l, mn) = CONJG(pw%cc(gpt))

            END DO
!$OMP END PARALLEL DO
         END IF

      END IF

      cpy = REAL(ngpts, KIND=dp)*1.e-6_dp
      CALL timestop(handle)

   END SUBROUTINE pw_scatter_p

! **************************************************************************************************
!> \brief Generic function for 3d FFT of a coefficient_type or pw_type
!> \param pw1 ...
!> \param pw2 ...
!> \param debug ...
!> \par History
!>      JGH (30-12-2000): New setup of functions and adaptation to parallelism
!>      JGH (04-01-2001): Moved routine from pws to this module, only covers
!>                        pw_types, no more coefficient types
!> \author apsi
!> \note
!>       fft_wrap_pw1pw2
! **************************************************************************************************
   SUBROUTINE fft_wrap_pw1pw2(pw1, pw2, debug)

      TYPE(pw_type), INTENT(IN)                  :: pw1
      TYPE(pw_type), INTENT(INOUT)               :: pw2
      LOGICAL, INTENT(IN), OPTIONAL                      :: debug

      CHARACTER(len=*), PARAMETER                        :: routineN = 'fft_wrap_pw1pw2'

      CHARACTER(LEN=9)                                   :: mode
      COMPLEX(KIND=dp), DIMENSION(:, :), POINTER         :: grays
      COMPLEX(KIND=dp), DIMENSION(:, :, :), CONTIGUOUS, POINTER      :: c_in, c_out
      INTEGER                                            :: dir, handle, handle2, my_pos, nrays, &
                                                            out_space, out_unit
      INTEGER, DIMENSION(3)                              :: nloc
      INTEGER, DIMENSION(:), POINTER                     :: n
      LOGICAL                                            :: test
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
      LOGICAL                                            :: use_pw_gpu
#endif
      REAL(KIND=dp)                                      :: norm

      CALL timeset(routineN, handle2)
      out_unit = cp_logger_get_default_io_unit()
      CALL timeset(routineN//"_"//TRIM(ADJUSTL(cp_to_string( &
                                               CEILING(pw1%pw_grid%cutoff/10)*10))), handle)

      NULLIFY (c_in)
      NULLIFY (c_out)

      IF (PRESENT(debug)) THEN
         test = debug
      ELSE
         test = .FALSE.
      END IF

      !..check if grids are compatible
      IF (.NOT. ASSOCIATED(pw1%pw_grid, pw2%pw_grid)) THEN
         IF (pw1%pw_grid%dvol /= pw2%pw_grid%dvol) THEN
            CPABORT("PW grids not compatible")
         END IF
         IF (pw1%pw_grid%para%group /= pw2%pw_grid%para%group) THEN
            CPABORT("PW grids have not compatible MPI groups")
         END IF
      END IF

      !..prepare input
      IF (pw1%in_space == REALSPACE) THEN
         dir = FWFFT
         norm = 1.0_dp/pw1%pw_grid%ngpts
         out_space = RECIPROCALSPACE
      ELSE IF (pw1%in_space == RECIPROCALSPACE) THEN
         dir = BWFFT
         norm = 1.0_dp
         out_space = REALSPACE
      ELSE
         CPABORT("Error in space tag")
      END IF

      n => pw1%pw_grid%npts

      mode = fftselect(pw1%in_use, pw2%in_use, pw1%in_space)

      IF (pw1%pw_grid%para%mode == PW_MODE_LOCAL) THEN

         !
         !..replicated data, use local FFT
         !

         IF (test .AND. out_unit > 0) THEN
            WRITE (out_unit, '(A)') " FFT Protocol "
            IF (dir == FWFFT) WRITE (out_unit, '(A,T76,A)') "  Transform direction ", "FWFFT"
            IF (dir == BWFFT) WRITE (out_unit, '(A,T76,A)') "  Transform direction ", "BWFFT"
            IF (pw1%in_space == REALSPACE) &
               WRITE (out_unit, '(A,T72,A)') "  in space ", "REALSPACE"
            IF (pw1%in_space == RECIPROCALSPACE) &
               WRITE (out_unit, '(A,T66,A)') "  in space ", "RECIPROCALSPACE"
            IF (out_space == REALSPACE) &
               WRITE (out_unit, '(A,T72,A)') "  out space ", "REALSPACE"
            IF (out_space == RECIPROCALSPACE) &
               WRITE (out_unit, '(A,T66,A)') "  out space ", "RECIPROCALSPACE"
            WRITE (out_unit, '(A,T66,E15.6)') "  scale factor", norm
         END IF

         SELECT CASE (mode)
         CASE DEFAULT
            CPABORT("Illegal combination of in_use and in_space")
         CASE ("FW_C3DC3D")
            c_in => pw1%cc3d
            c_out => pw2%cc3d
            CALL fft3d(dir, n, c_in, c_out, scale=norm, debug=test)
         CASE ("FW_R3DC3D")
            pw2%cc3d = CMPLX(pw1%cr3d, 0.0_dp, KIND=dp)
            c_out => pw2%cc3d
            CALL fft3d(dir, n, c_out, scale=norm, debug=test)
         CASE ("FW_C3DC1D")
            c_in => pw1%cc3d
            ALLOCATE (c_out(n(1), n(2), n(3)))
            ! transform
            CALL fft3d(dir, n, c_in, c_out, scale=norm, debug=test)
            ! gather results
            IF (test .AND. out_unit > 0) WRITE (out_unit, '(A)') "  PW_GATHER : 3d -> 1d "
            CALL pw_gather(pw2, c_out)
            DEALLOCATE (c_out)
         CASE ("FW_R3DC1D")
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
            CALL pw_gpu_r3dc1d_3d(pw1, pw2, scale=norm)
#elif defined (__PW_FPGA)
            ALLOCATE (c_out(n(1), n(2), n(3)))
            ! check if bitstream for the fft size is present
            ! if not, perform fft3d in CPU
            IF (pw_fpga_init_bitstream(n) == 1) THEN
               CALL copy_rc(pw1%cr3d, c_out)
#if (__PW_FPGA_SP && __PW_FPGA)
               CALL pw_fpga_r3dc1d_3d_sp(n, c_out)
#else
               CALL pw_fpga_r3dc1d_3d_dp(n, c_out)
#endif
               CALL zdscal(n(1)*n(2)*n(3), norm, c_out, 1)
               CALL pw_gather(pw2, c_out)
            ELSE
               CALL copy_rc(pw1%cr3d, c_out)
               CALL fft3d(dir, n, c_out, scale=norm, debug=test)
               CALL pw_gather(pw2, c_out)
            END IF
            DEALLOCATE (c_out)
#else
            ALLOCATE (c_out(n(1), n(2), n(3)))
            CALL copy_rc(pw1%cr3d, c_out)
            CALL fft3d(dir, n, c_out, scale=norm, debug=test)
            CALL pw_gather(pw2, c_out)
            DEALLOCATE (c_out)
#endif
         CASE ("BW_C3DC3D")
            c_in => pw1%cc3d
            c_out => pw2%cc3d
            CALL fft3d(dir, n, c_in, c_out, scale=norm, debug=test)
         CASE ("BW_C3DR3D")
            c_in => pw1%cc3d
            ALLOCATE (c_out(n(1), n(2), n(3)))
            CALL fft3d(dir, n, c_in, c_out, scale=norm, debug=test)
            ! use real part only
            IF (test .AND. out_unit > 0) WRITE (out_unit, '(A)') "  REAL part "
            pw2%cr3d = REAL(c_out, KIND=dp)
            DEALLOCATE (c_out)
         CASE ("BW_C1DC3D")
            c_out => pw2%cc3d
            IF (test .AND. out_unit > 0) WRITE (out_unit, '(A)') "  PW_SCATTER : 3d -> 1d "
            CALL pw_scatter(pw1, c_out)
            CALL fft3d(dir, n, c_out, scale=norm, debug=test)
         CASE ("BW_C1DR3D")
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
            CALL pw_gpu_c1dr3d_3d(pw1, pw2, scale=norm)
#elif defined (__PW_FPGA)
            ALLOCATE (c_out(n(1), n(2), n(3)))
            ! check if bitstream for the fft size is present
            ! if not, perform fft3d in CPU
            IF (pw_fpga_init_bitstream(n) == 1) THEN
               CALL pw_scatter(pw1, c_out)
               ! transform using FPGA
#if (__PW_FPGA_SP && __PW_FPGA)
               CALL pw_fpga_c1dr3d_3d_sp(n, c_out)
#else
               CALL pw_fpga_c1dr3d_3d_dp(n, c_out)
#endif
               CALL zdscal(n(1)*n(2)*n(3), norm, c_out, 1)
               ! use real part only
               CALL copy_cr(c_out, pw2%cr3d)
            ELSE
               IF (test .AND. out_unit > 0) WRITE (out_unit, '(A)') "  PW_SCATTER : 3d -> 1d "
               CALL pw_scatter(pw1, c_out)
               ! transform
               CALL fft3d(dir, n, c_out, scale=norm, debug=test)
               ! use real part only
               IF (test .AND. out_unit > 0) WRITE (out_unit, '(A)') "  REAL part "
               CALL copy_cr(c_out, pw2%cr3d)
            END IF
            DEALLOCATE (c_out)
#else
            ALLOCATE (c_out(n(1), n(2), n(3)))
            IF (test .AND. out_unit > 0) WRITE (out_unit, '(A)') "  PW_SCATTER : 3d -> 1d "
            CALL pw_scatter(pw1, c_out)
            ! transform
            CALL fft3d(dir, n, c_out, scale=norm, debug=test)
            ! use real part only
            IF (test .AND. out_unit > 0) WRITE (out_unit, '(A)') "  REAL part "
            CALL copy_cr(c_out, pw2%cr3d)
            DEALLOCATE (c_out)
#endif
         END SELECT

         IF (test .AND. out_unit > 0) WRITE (out_unit, '(A)') " End of FFT Protocol "

      ELSE

         !
         !..parallel FFT
         !

         IF (test .AND. pw1%pw_grid%para%group_head .AND. out_unit > 0) THEN
            WRITE (out_unit, '(A)') " FFT Protocol "
            IF (dir == FWFFT) WRITE (out_unit, '(A,T76,A)') "  Transform direction ", "FWFFT"
            IF (dir == BWFFT) WRITE (out_unit, '(A,T76,A)') "  Transform direction ", "BWFFT"
            IF (pw1%in_space == REALSPACE) &
               WRITE (out_unit, '(A,T72,A)') "  in space ", "REALSPACE"
            IF (pw1%in_space == RECIPROCALSPACE) &
               WRITE (out_unit, '(A,T66,A)') "  in space ", "RECIPROCALSPACE"
            IF (out_space == REALSPACE) &
               WRITE (out_unit, '(A,T72,A)') "  out space ", "REALSPACE"
            IF (out_space == RECIPROCALSPACE) &
               WRITE (out_unit, '(A,T66,A)') "  out space ", "RECIPROCALSPACE"
            WRITE (out_unit, '(A,T66,E15.6)') "  scale factor", norm
         END IF

         my_pos = pw1%pw_grid%para%my_pos
         nrays = pw1%pw_grid%para%nyzray(my_pos)
         grays => pw1%pw_grid%grays
         CPASSERT(SIZE(grays, 1) == n(1))
         CPASSERT(SIZE(grays, 2) == nrays)

         SELECT CASE (mode)
         CASE DEFAULT
            CALL cp_abort(__LOCATION__, &
                          "Illegal combination of in_use and in_space "// &
                          "in parallel 3d FFT")
         CASE ("FW_C3DC1D")
            !..prepare input
            c_in => pw1%cc3d
            CALL zero_c(grays)
            !..transform
            IF (pw1%pw_grid%para%ray_distribution) THEN
               CALL fft3d(dir, n, c_in, grays, pw1%pw_grid%para%group, &
                          pw1%pw_grid%para%rs_group, &
                          pw1%pw_grid%para%yzp, pw1%pw_grid%para%nyzray, &
                          pw1%pw_grid%para%bo, scale=norm, debug=test)
            ELSE
               CALL fft3d(dir, n, c_in, grays, pw1%pw_grid%para%rs_group, &
                          pw1%pw_grid%para%bo, scale=norm, debug=test)
            END IF
            !..prepare output
            IF (test .AND. pw1%pw_grid%para%group_head .AND. out_unit > 0) &
               WRITE (out_unit, '(A)') "  PW_GATHER : 2d -> 1d "
            CALL pw_gather(pw2, grays)
         CASE ("FW_R3DC1D")
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
            ! (no ray dist. is not efficient in CUDA)
            use_pw_gpu = pw1%pw_grid%para%ray_distribution
            IF (use_pw_gpu) THEN
               CALL pw_gpu_r3dc1d_3d_ps(pw1, pw2, scale=norm)
            ELSE
#endif
!..   prepare input
               nloc = pw1%pw_grid%npts_local
               ALLOCATE (c_in(nloc(1), nloc(2), nloc(3)))
               CALL copy_rc(pw1%cr3d, c_in)
               CALL zero_c(grays)
               !..transform
               IF (pw1%pw_grid%para%ray_distribution) THEN
                  CALL fft3d(dir, n, c_in, grays, pw1%pw_grid%para%group, &
                             pw1%pw_grid%para%rs_group, &
                             pw1%pw_grid%para%yzp, pw1%pw_grid%para%nyzray, &
                             pw1%pw_grid%para%bo, scale=norm, debug=test)
               ELSE
                  CALL fft3d(dir, n, c_in, grays, pw1%pw_grid%para%rs_group, &
                             pw1%pw_grid%para%bo, scale=norm, debug=test)
               END IF
               !..prepare output
               IF (test .AND. pw1%pw_grid%para%group_head .AND. out_unit > 0) &
                  WRITE (out_unit, '(A)') "  PW_GATHER : 2d -> 1d "
               CALL pw_gather(pw2, grays)
               DEALLOCATE (c_in)

#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
            END IF
#endif
         CASE ("BW_C1DC3D")
            !..prepare input
            IF (test .AND. pw1%pw_grid%para%group_head .AND. out_unit > 0) &
               WRITE (out_unit, '(A)') "  PW_SCATTER : 2d -> 1d "
            CALL zero_c(grays)
            CALL pw_scatter(pw1, grays)
            c_in => pw2%cc3d
            !..transform
            IF (pw1%pw_grid%para%ray_distribution) THEN
               CALL fft3d(dir, n, c_in, grays, pw1%pw_grid%para%group, &
                          pw1%pw_grid%para%rs_group, &
                          pw1%pw_grid%para%yzp, pw1%pw_grid%para%nyzray, &
                          pw1%pw_grid%para%bo, scale=norm, debug=test)
            ELSE
               CALL fft3d(dir, n, c_in, grays, pw1%pw_grid%para%rs_group, &
                          pw1%pw_grid%para%bo, scale=norm, debug=test)
            END IF
            !..prepare output (nothing to do)
         CASE ("BW_C1DR3D")
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
            ! (no ray dist. is not efficient in CUDA)
            use_pw_gpu = pw1%pw_grid%para%ray_distribution
            IF (use_pw_gpu) THEN
               CALL pw_gpu_c1dr3d_3d_ps(pw1, pw2, scale=norm)
            ELSE
#endif
!..   prepare input
               IF (test .AND. pw1%pw_grid%para%group_head .AND. out_unit > 0) &
                  WRITE (out_unit, '(A)') "  PW_SCATTER : 2d -> 1d "
               CALL zero_c(grays)
               CALL pw_scatter(pw1, grays)
               nloc = pw2%pw_grid%npts_local
               ALLOCATE (c_in(nloc(1), nloc(2), nloc(3)))
               !..transform
               IF (pw1%pw_grid%para%ray_distribution) THEN
                  CALL fft3d(dir, n, c_in, grays, pw1%pw_grid%para%group, &
                             pw1%pw_grid%para%rs_group, &
                             pw1%pw_grid%para%yzp, pw1%pw_grid%para%nyzray, &
                             pw1%pw_grid%para%bo, scale=norm, debug=test)
               ELSE
                  CALL fft3d(dir, n, c_in, grays, pw1%pw_grid%para%rs_group, &
                             pw1%pw_grid%para%bo, scale=norm, debug=test)
               END IF
               !..prepare output
               IF (test .AND. pw1%pw_grid%para%group_head .AND. out_unit > 0) &
                  WRITE (out_unit, '(A)') "  Real part "
               CALL copy_cr(c_in, pw2%cr3d)
               DEALLOCATE (c_in)
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
            END IF
#endif
         END SELECT

      END IF

      !..update the space tag for pw2
      pw2%in_space = out_space

      IF (test .AND. pw1%pw_grid%para%group_head .AND. out_unit > 0) THEN
         WRITE (out_unit, '(A)') " End of FFT Protocol "
      END IF
      CALL timestop(handle)
      CALL timestop(handle2)

   END SUBROUTINE fft_wrap_pw1pw2

! **************************************************************************************************
!> \brief ...
!> \param use1 ...
!> \param use2 ...
!> \param space1 ...
!> \return ...
! **************************************************************************************************
   FUNCTION fftselect(use1, use2, space1) RESULT(mode)
      INTEGER, INTENT(IN)                                :: use1, use2, space1
      CHARACTER(LEN=9)                                   :: mode

      IF (space1 == REALSPACE) THEN
         mode(1:3) = "FW_"
      ELSE IF (space1 == RECIPROCALSPACE) THEN
         mode(1:3) = "BW_"
      ELSE
         CPABORT("Error in space tag")
      END IF

      SELECT CASE (use1)
      CASE (COMPLEXDATA3D)
         mode(4:6) = "C3D"
      CASE (REALDATA3D)
         mode(4:6) = "R3D"
      CASE (COMPLEXDATA1D)
         mode(4:6) = "C1D"
      CASE (REALDATA1D)
         mode(4:6) = "R1D"
      CASE DEFAULT
         CPABORT("Error in use1 tag")
      END SELECT

      SELECT CASE (use2)
      CASE (COMPLEXDATA3D)
         mode(7:9) = "C3D"
      CASE (REALDATA3D)
         mode(7:9) = "R3D"
      CASE (COMPLEXDATA1D)
         mode(7:9) = "C1D"
      CASE (REALDATA1D)
         mode(7:9) = "R1D"
      CASE DEFAULT
         CPABORT("Error in use1 tag")
      END SELECT

   END FUNCTION fftselect

! **************************************************************************************************
!> \brief writes a small description of the actual grid
!>      (change to output the data as cube file, maybe with an
!>      optional long_description arg?)
!> \param pw the pw data to output
!> \param unit_nr the unit to output to
!> \par History
!>      08.2002 created [fawzi]
!> \author Fawzi Mohamed
! **************************************************************************************************
   SUBROUTINE pw_write(pw, unit_nr)
      TYPE(pw_type), INTENT(in)                          :: pw
      INTEGER, INTENT(in)                                :: unit_nr

      INTEGER                                            :: iostatus

      WRITE (unit=unit_nr, fmt="('<pw>:{ ')", iostat=iostatus)

      SELECT CASE (pw%in_use)
      CASE (REALDATA1D)
         WRITE (unit=unit_nr, fmt="(a)", iostat=iostatus) " in_use=REALDATA1D"
         IF (ASSOCIATED(pw%cr)) THEN
            WRITE (unit=unit_nr, fmt="(' cr=<real(',i8,':',i8,')>')") &
               LBOUND(pw%cr, 1), UBOUND(pw%cr, 1)
         ELSE
            WRITE (unit=unit_nr, fmt="(' cr=*null*')")
         END IF
      CASE (REALDATA3D)
         WRITE (unit=unit_nr, fmt="(a)", iostat=iostatus) " in_use=REALDATA3D"
         IF (ASSOCIATED(pw%cr3d)) THEN
            WRITE (unit=unit_nr, fmt="(' cr3d=<real(',i8,':',i8,',',i8,':',i8,',',i8,':',i8,')>')") &
               LBOUND(pw%cr3d, 1), UBOUND(pw%cr3d, 1), LBOUND(pw%cr3d, 2), UBOUND(pw%cr3d, 2), &
               LBOUND(pw%cr3d, 3), UBOUND(pw%cr3d, 3)
         ELSE
            WRITE (unit=unit_nr, fmt="(' cr3d=*null*')")
         END IF
      CASE (COMPLEXDATA1D)
         WRITE (unit=unit_nr, fmt="(a)", iostat=iostatus) " in_use=COMPLEXDATA1D"
         IF (ASSOCIATED(pw%cc)) THEN
            WRITE (unit=unit_nr, fmt="(' cc=<real(',i8,':',i8,')>')") &
               LBOUND(pw%cc, 1), UBOUND(pw%cc, 1)
         ELSE
            WRITE (unit=unit_nr, fmt="(' cc=*null*')")
         END IF
      CASE (COMPLEXDATA3D)
         WRITE (unit=unit_nr, fmt="(a)", iostat=iostatus) " in_use=COMPLEXDATA3D"
         IF (ASSOCIATED(pw%cc3d)) THEN
            WRITE (unit=unit_nr, fmt="(' cc3d=<real(',i8,':',i8,',',i8,':',i8,',',i8,':',i8,')>')") &
               LBOUND(pw%cc3d, 1), UBOUND(pw%cc3d, 1), LBOUND(pw%cc3d, 2), UBOUND(pw%cc3d, 2), &
               LBOUND(pw%cc3d, 3), UBOUND(pw%cc3d, 3)
         ELSE
            WRITE (unit=unit_nr, fmt="(' cr3d=*null*')")
         END IF
      CASE default
         WRITE (unit=unit_nr, fmt="(' in_use=',i8,',')", iostat=iostatus) &
            pw%in_use
      END SELECT

      SELECT CASE (pw%in_space)
      CASE (NOSPACE)
         WRITE (unit=unit_nr, fmt="(a)", iostat=iostatus) " in_space=NOSPACE"
      CASE (REALSPACE)
         WRITE (unit=unit_nr, fmt="(a)", iostat=iostatus) " in_space=REALSPACE"
      CASE (RECIPROCALSPACE)
         WRITE (unit=unit_nr, fmt="(a)", iostat=iostatus) " in_space=RECIPROCALSPACE"
      CASE default
         WRITE (unit=unit_nr, fmt="(' in_space=',i8,',')", iostat=iostatus) &
            pw%in_space
      END SELECT

      WRITE (unit=unit_nr, fmt="(' pw_grid%id_nr=',i8,/,' }')", iostat=iostatus) &
         pw%pw_grid%id_nr

   END SUBROUTINE pw_write

! **************************************************************************************************
!> \brief ...
!> \param grida ...
!> \param gridb ...
!> \return ...
! **************************************************************************************************
   ELEMENTAL FUNCTION pw_compatible(grida, gridb) RESULT(compat)
      TYPE(pw_grid_type), INTENT(IN)                     :: grida, gridb
      LOGICAL                                            :: compat

      compat = .FALSE.
      IF (grida%id_nr == gridb%id_nr) THEN
         compat = .TRUE.
      ELSE IF (grida%reference == gridb%id_nr) THEN
         compat = .TRUE.
      ELSE IF (gridb%reference == grida%id_nr) THEN
         compat = .TRUE.
      END IF

   END FUNCTION pw_compatible

! **************************************************************************************************
!> \brief Calculate integral over unit cell for functions in plane wave basis
!>      only returns the real part of it ......
!> \param pw1 ...
!> \param pw2 ...
!> \param sumtype ...
!> \return ...
!> \par History
!>      JGH (14-Mar-2001) : Parallel sum and some tests, HALFSPACE case
!> \author apsi
! **************************************************************************************************
   FUNCTION pw_integral_ab(pw1, pw2, sumtype) RESULT(integral_value)

      TYPE(pw_type), INTENT(IN)                          :: pw1, pw2
      INTEGER, INTENT(IN), OPTIONAL                      :: sumtype
      REAL(KIND=dp)                                      :: integral_value

      CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_integral_ab'

      INTEGER                                            :: handle, loc_sumtype

      CALL timeset(routineN, handle)

      loc_sumtype = do_accurate_sum
      IF (PRESENT(sumtype)) loc_sumtype = sumtype

      IF (.NOT. ASSOCIATED(pw1%pw_grid, pw2%pw_grid)) THEN
         CPABORT("Grids incompatible")
      END IF

      ! do standard sum
      IF (loc_sumtype == do_standard_sum) THEN

         ! since the return value is real, only do accurate sum on the real bit ?
         IF (pw1%in_use == REALDATA3D .AND. pw2%in_use == REALDATA3D) THEN
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw1, pw2, integral_value)
            integral_value = SUM(pw1%cr3d(:, :, :) &
                                 *pw2%cr3d(:, :, :))
!$OMP END PARALLEL WORKSHARE
         ELSE IF (pw1%in_use == REALDATA3D &
                  .AND. pw2%in_use == COMPLEXDATA3D) THEN
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw1, pw2, integral_value)
            integral_value = REAL(SUM(pw1%cr3d(:, :, :) &
                                      *pw2%cc3d(:, :, :)), KIND=dp) !? complex bit
!$OMP END PARALLEL WORKSHARE
         ELSE IF (pw1%in_use == COMPLEXDATA3D &
                  .AND. pw2%in_use == REALDATA3D) THEN
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw1, pw2, integral_value)
            integral_value = REAL(SUM(pw1%cc3d(:, :, :) &
                                      *pw2%cr3d(:, :, :)), KIND=dp) !? complex bit
!$OMP END PARALLEL WORKSHARE
         ELSE IF (pw1%in_use == COMPLEXDATA3D &
                  .AND. pw2%in_use == COMPLEXDATA3D) THEN
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw1, pw2, integral_value)
            integral_value = REAL(SUM(CONJG(pw1%cc3d(:, :, :)) &
                                      *pw2%cc3d(:, :, :)), KIND=dp) !? complex bit
!$OMP END PARALLEL WORKSHARE

         ELSE IF (pw1%in_use == REALDATA1D &
                  .AND. pw2%in_use == REALDATA1D) THEN
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw1, pw2, integral_value)
            integral_value = DOT_PRODUCT(pw1%cr(:), pw2%cr(:))
!$OMP END PARALLEL WORKSHARE
         ELSE IF (pw1%in_use == REALDATA1D &
                  .AND. pw2%in_use == COMPLEXDATA1D) THEN
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw1, pw2, integral_value)
            integral_value = REAL(DOT_PRODUCT(pw1%cr(:), pw2%cc(:)), KIND=dp) !? complex bit
!$OMP END PARALLEL WORKSHARE
         ELSE IF (pw1%in_use == COMPLEXDATA1D &
                  .AND. pw2%in_use == REALDATA1D) THEN
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw1, pw2, integral_value)
            integral_value = REAL(DOT_PRODUCT(pw1%cc(:), pw2%cr(:)), KIND=dp) !? complex bit
!$OMP END PARALLEL WORKSHARE
         ELSE IF (pw1%in_use == COMPLEXDATA1D &
                  .AND. pw2%in_use == COMPLEXDATA1D) THEN
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) SHARED(pw1, pw2, integral_value)
            integral_value = REAL(DOT_PRODUCT(CONJG(pw1%cc(:)), CONJG(pw2%cc(:))), KIND=dp) !? complex bit
!$OMP END PARALLEL WORKSHARE
         ELSE
            CPABORT("No possible DATA")
         END IF

         ! do accurate sum
      ELSE

         ! since the return value is real, only do accurate sum on the real bit ?
         IF (pw1%in_use == REALDATA3D .AND. pw2%in_use == REALDATA3D) THEN
            integral_value = accurate_sum(pw1%cr3d(:, :, :) &
                                          *pw2%cr3d(:, :, :))
         ELSE IF (pw1%in_use == REALDATA3D &
                  .AND. pw2%in_use == COMPLEXDATA3D) THEN
            integral_value = REAL(accurate_sum(pw1%cr3d(:, :, :) &
                                               *pw2%cc3d(:, :, :)), KIND=dp) !? complex bit
         ELSE IF (pw1%in_use == COMPLEXDATA3D &
                  .AND. pw2%in_use == REALDATA3D) THEN
            integral_value = REAL(accurate_sum(pw1%cc3d(:, :, :) &
                                               *pw2%cr3d(:, :, :)), KIND=dp) !? complex bit
         ELSE IF (pw1%in_use == COMPLEXDATA3D &
                  .AND. pw2%in_use == COMPLEXDATA3D) THEN
            integral_value = REAL(accurate_sum(CONJG(pw1%cc3d(:, :, :)) &
                                               *pw2%cc3d(:, :, :)), KIND=dp) !? complex bit

         ELSE IF (pw1%in_use == REALDATA1D &
                  .AND. pw2%in_use == REALDATA1D) THEN
            integral_value = accurate_sum(pw1%cr(:)*pw2%cr(:))
         ELSE IF (pw1%in_use == REALDATA1D &
                  .AND. pw2%in_use == COMPLEXDATA1D) THEN
            integral_value = REAL(accurate_sum(pw1%cr(:)*pw2%cc(:)), KIND=dp) !? complex bit
         ELSE IF (pw1%in_use == COMPLEXDATA1D &
                  .AND. pw2%in_use == REALDATA1D) THEN
            integral_value = REAL(accurate_sum(pw1%cc(:)*pw2%cr(:)), KIND=dp) !? complex bit
         ELSE IF (pw1%in_use == COMPLEXDATA1D &
                  .AND. pw2%in_use == COMPLEXDATA1D) THEN
            integral_value = REAL(accurate_sum(CONJG(pw1%cc(:))*pw2%cc(:)), KIND=dp) !? complex bit
         ELSE
            CPABORT("No possible DATA")
         END IF

      END IF

      IF (pw1%in_use == REALDATA3D .OR. pw1%in_use == COMPLEXDATA3D) THEN
         integral_value = integral_value*pw1%pw_grid%dvol
      ELSE
         integral_value = integral_value*pw1%pw_grid%vol
      END IF
      IF (pw1%in_use == COMPLEXDATA1D) THEN
         IF (pw1%pw_grid%grid_span == HALFSPACE) THEN
            integral_value = 2.0_dp*integral_value
            IF (pw1%pw_grid%have_g0) integral_value = integral_value - &
                                                      REAL(CONJG(pw1%cc(1))*pw2%cc(1), KIND=dp)
         END IF
      END IF

      IF (pw1%pw_grid%para%mode == PW_MODE_DISTRIBUTED) &
         CALL pw1%pw_grid%para%group%sum(integral_value)
      CALL timestop(handle)

   END FUNCTION pw_integral_ab

! **************************************************************************************************
!> \brief ...
!> \param pw1 ...
!> \param pw2 ...
!> \return ...
! **************************************************************************************************
   FUNCTION pw_integral_a2b(pw1, pw2) RESULT(integral_value)

      TYPE(pw_type), INTENT(IN)                          :: pw1, pw2
      REAL(KIND=dp)                                      :: integral_value

      CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_integral_a2b'

      INTEGER                                            :: handle

      CALL timeset(routineN, handle)
      IF (.NOT. ASSOCIATED(pw1%pw_grid, pw2%pw_grid)) THEN
         CPABORT("Grids incompatible")
      END IF
      IF (pw1%in_use == REALDATA1D .AND. &
          pw2%in_use == REALDATA1D) THEN
         integral_value = accurate_sum(pw1%cr(:)*pw2%cr(:) &
                                       *pw1%pw_grid%gsq(:))
      ELSE IF (pw1%in_use == COMPLEXDATA1D .AND. &
               pw2%in_use == COMPLEXDATA1D) THEN
         integral_value = accurate_sum(REAL(CONJG(pw1%cc(:)) &
                                            *pw2%cc(:), KIND=dp)*pw1%pw_grid%gsq(:))
         IF (pw1%pw_grid%grid_span == HALFSPACE) THEN
            integral_value = 2.0_dp*integral_value
         END IF
      ELSE
         CPABORT("No possible DATA")
      END IF

      IF (pw1%in_use == REALDATA3D .OR. pw1%in_use == COMPLEXDATA3D) THEN
         integral_value = integral_value*pw1%pw_grid%dvol
      ELSE
         integral_value = integral_value*pw1%pw_grid%vol
      END IF

      IF (pw1%pw_grid%para%mode == PW_MODE_DISTRIBUTED) &
         CALL pw1%pw_grid%para%group%sum(integral_value)
      CALL timestop(handle)

   END FUNCTION pw_integral_a2b

! **************************************************************************************************
!> \brief Calculate the structure factor for point r
!> \param sf ...
!> \param r ...
!> \par History
!>      none
!> \author JGH (05-May-2006)
!> \note
!>      PW has to be in RECIPROCALSPACE and data in use is COMPLEXDATA1D
! **************************************************************************************************
   SUBROUTINE pw_structure_factor(sf, r)

      TYPE(pw_type), INTENT(IN)                          :: sf
      REAL(KIND=dp), DIMENSION(:), INTENT(in)            :: r

      CHARACTER(len=*), PARAMETER :: routineN = 'pw_structure_factor'

      INTEGER                                            :: cnt, handle, ig
      REAL(KIND=dp)                                      :: arg, flop

      CALL timeset(routineN, handle)
      flop = 0.0_dp

      IF (sf%in_space == RECIPROCALSPACE .AND. &
          sf%in_use == COMPLEXDATA1D) THEN

         cnt = SIZE(sf%cc)

!$OMP PARALLEL DO PRIVATE (ig,arg) DEFAULT(NONE) SHARED(sf,r,cnt)
         DO ig = 1, cnt
            arg = DOT_PRODUCT(sf%pw_grid%g(:, ig), r)
            sf%cc(ig) = CMPLX(COS(arg), -SIN(arg), KIND=dp)
         END DO
         flop = flop + 7*cnt
      ELSE

         CPABORT("No suitable data field")

      END IF

      flop = flop*1.e-6_dp
      CALL timestop(handle)

   END SUBROUTINE pw_structure_factor

! **************************************************************************************************
!> \brief ...
!> \param fun ...
!> \param isign ...
!> \param oprt ...
!> \return ...
! **************************************************************************************************
   FUNCTION pw_integrate_function(fun, isign, oprt) RESULT(total_fun)

      TYPE(pw_type), INTENT(IN)                          :: fun
      INTEGER, INTENT(IN), OPTIONAL                      :: isign
      CHARACTER(len=*), INTENT(IN), OPTIONAL             :: oprt
      REAL(KIND=dp)                                      :: total_fun

      INTEGER                                            :: iop

      iop = 0
      IF (PRESENT(oprt)) THEN
         SELECT CASE (oprt)
         CASE ("ABS", "abs")
            iop = 1
         CASE DEFAULT
            CPABORT("Unknown operator")
         END SELECT
      END IF

      total_fun = 0._dp

      IF (fun%in_space == REALSPACE) THEN
         IF (fun%in_use == REALDATA3D) THEN
            ! do reduction using maximum accuracy
            IF (iop == 1) THEN
               total_fun = fun%pw_grid%dvol*accurate_sum(ABS(fun%cr3d))
            ELSE
               total_fun = fun%pw_grid%dvol*accurate_sum(fun%cr3d)
            END IF
         ELSE
            CPABORT("In_space/in_use combination not implemented")
         END IF
      ELSEIF (fun%in_space == RECIPROCALSPACE) THEN
         IF (iop == 1) &
            CPABORT("Operator ABS not implemented")
         IF (fun%in_use == COMPLEXDATA1D) THEN
            IF (fun%pw_grid%have_g0) total_fun = REAL(fun%pw_grid%vol*fun%cc(1), KIND=dp)
         ELSE
            CPABORT("In_space/in_use combination not implemented")
         END IF
      ELSE
         CPABORT("No space defined")
      END IF
      IF (fun%pw_grid%para%mode /= PW_MODE_LOCAL) THEN
         CALL fun%pw_grid%para%group%sum(total_fun)
      END IF
      IF (PRESENT(isign)) THEN
         total_fun = total_fun*SIGN(1._dp, REAL(isign, dp))
      END IF

   END FUNCTION pw_integrate_function

! **************************************************************************************************
!> \brief Initialize pw values using values from a real array data, which might be defined on a
!>        smaller grid than pw but which is contained in it
!> \param pw the pw to initialize
!> \param values the array holding the input data
!> \par History
!>      Created 12.2016
!> \author Nico Holmberg
! **************************************************************************************************
   SUBROUTINE pw_set(pw, values)

      TYPE(pw_type), INTENT(IN)                          :: pw
      REAL(KIND=dp), DIMENSION(:, :, :), INTENT(IN), &
         POINTER                                         :: values

      CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_set'

      INTEGER                                            :: handle, i, j, k
      LOGICAL                                            :: is_match

      CALL timeset(routineN, handle)
      IF (pw%in_use == REALDATA3D) THEN
         is_match = .TRUE.
         is_match = is_match .AND. (LBOUND(values, 1) .GE. LBOUND(pw%cr3d, 1))
         is_match = is_match .AND. (UBOUND(values, 1) .LE. UBOUND(pw%cr3d, 1))
         is_match = is_match .AND. (LBOUND(values, 2) .GE. LBOUND(pw%cr3d, 2))
         is_match = is_match .AND. (UBOUND(values, 2) .LE. UBOUND(pw%cr3d, 2))
         is_match = is_match .AND. (LBOUND(values, 3) .GE. LBOUND(pw%cr3d, 3))
         is_match = is_match .AND. (UBOUND(values, 3) .LE. UBOUND(pw%cr3d, 3))
         IF (.NOT. is_match) &
            CPABORT("Incompatible data fields")
         DO i = LBOUND(values, 3), UBOUND(values, 3)
            DO j = LBOUND(values, 2), UBOUND(values, 2)
               DO k = LBOUND(values, 1), UBOUND(values, 1)
                  pw%cr3d(k, j, i) = values(k, j, i)
               END DO
            END DO
         END DO
      ELSE
         CPABORT("Illegal pw type, should be REALDATA3D.")
      END IF
      CALL timestop(handle)

   END SUBROUTINE pw_set

END MODULE pw_methods
