#include <print>

/// Computes the dot product of two 4-element float vectors with the SSE4.1
/// DPPS instruction, issued through GCC-style extended inline assembly
/// (AT&T operand order: source first, destination last).
///
/// @param a  First operand; exactly 4 consecutive floats are read.
/// @param b  Second operand; exactly 4 consecutive floats are read.
/// @return   a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3].
///
/// The loads use MOVUPS, so neither pointer needs 16-byte alignment.
float
dotProduct (float a[], float b[])
{
  // Mask 0xF1:
  //   [7:4] = 1111 (Multiply all 4 pairs)
  //   [3:0] = 0001 (Store result only in the lowest 32 bits of the output)
  // constexpr guarantees a constant expression for the "i" constraint below.
  constexpr unsigned char mask = 0xF1;

  // MOVUPS reads 16 bytes from each operand.  The memory constraints must
  // therefore name the whole 4-float object, not just its first element;
  // otherwise the optimiser is free to assume elements 1..3 are never read
  // and may drop or delay the stores that initialise them.
  using Vec4Ptr = const float (*)[4];

  float result {};
  // asm ( code : outputs : inputs : clobbers );
  // For outputs, syntax is
  //   [name] "constraint" (variable)
  // For inputs, syntax is
  //   [name] "constraint" (expression)
  asm (
       R"(
           movups %[v1], %%xmm0
           movups %[v2], %%xmm1
           dpps %[imm], %%xmm1, %%xmm0
           movss %%xmm0, %[res]
        )"
       : [res] "=m" (result)                          // Outputs
       : [v1] "m" (*reinterpret_cast<Vec4Ptr> (a)),   // Inputs (16 bytes each)
         [v2] "m" (*reinterpret_cast<Vec4Ptr> (b)),
         [imm] "i" (mask)
       : "%xmm0", "%xmm1"                             // Clobbered registers
  );

  return result;
}

// Demo entry point: builds two 4-element vectors, computes their dot
// product with dotProduct, and prints the result on one line.
int
main ()
{
  float lhs[] { 1.0f, 2.0f, 3.0f, 4.0f };
  float rhs[] { 5.0f, 6.0f, 7.0f, 8.0f };

  // 1*5 + 2*6 + 3*7 + 4*8 = 70 is the mathematically expected value.
  const float dot = dotProduct (lhs, rhs);
  std::println ("Dot Product: {}", dot);

  return 0;
}
