alignment for load and store
suchismith1993 committed Nov 11, 2024
1 parent f865ac4 commit 79564da
Showing 1 changed file with 72 additions and 49 deletions.
121 changes: 72 additions & 49 deletions src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
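This commit makes the GHASH stub tolerate unaligned state, subkeyH, and data pointers: the VMX lvx and stvx instructions silently clear the low four address bits, so a pointer that is not 16-byte aligned must be spliced together from the two aligned quadwords it straddles. The load side uses the classic lvsl/vperm idiom, the store side the lvsr/vsel read-modify-write idiom. Below is a minimal C++ sketch of the load side using AltiVec intrinsics rather than the stub's macro-assembler; load_unaligned16 is a hypothetical helper name, and vec_lvsl is the traditional big-endian form of the idiom, which needs extra care on little-endian targets.

#include <altivec.h>
#include <stdint.h>

// Hypothetical helper mirroring the stub's aligned/unaligned load split.
static __vector unsigned char load_unaligned16(const unsigned char* p) {
  if (((uintptr_t)p & 15) == 0) {
    return vec_ld(0, p);                        // aligned: a single lvx suffices
  }
  __vector unsigned char hi   = vec_ld(0, p);   // aligned block containing p
  __vector unsigned char lo   = vec_ld(16, p);  // the following aligned block
  __vector unsigned char perm = vec_lvsl(0, p); // shift pattern from p's low 4 bits
  return vec_perm(hi, lo, perm);                // splice out the 16 wanted bytes
}

Both vec_ld accesses are aligned, so neither can cross a page boundary. On POWER8 and later an unaligned VSX load (e.g. vec_xl) would be an alternative; the stub instead stays with pure VMX loads plus permutes.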
@@ -71,7 +71,6 @@
#define STUB_ENTRY(name) ((FunctionDescriptor*)StubRoutines::name)->entry()
#endif

class StubGenerator: public StubCodeGenerator {
private:

@@ -738,23 +737,17 @@
address generate_ghash_processBlocks() {
StubCodeMark mark(this, "StubRoutines", "ghash");
address start = __ function_entry();

Register state   = R3_ARG1; // long[] st
Register subkeyH = R4_ARG2; // long[] subH
Register data    = R5_ARG3; // byte[] data
Register blocks = R6_ARG4;

// Temporary registers
Register temp1 = R8;
Register temp2 = R9;
Register temp3 = R10;
Register temp4 = R11;
Register align = data; // alias of data, used by the unaligned load in the loop
VectorRegister vH = VR0;
VectorRegister vX = VR1;
VectorRegister vH_shift = VR2;
@@ -778,52 +771,62 @@ address generate_ghash_processBlocks() {
VectorRegister vHigh = VR20;
VectorRegister vLow = VR21;
VectorRegister vPerm = VR22;

VectorRegister vZero_Stored = VR23;
VectorRegister vMask = VR24;
VectorRegister vS = VR25;
Label L_end, L_aligned, L_align_2, L_end_2, L_aligned3, L_end3, L_aligned4, L_end4;
static const unsigned char perm_pattern[16] __attribute__((aligned(16))) = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
static const unsigned char perm_pattern2[16] __attribute__((aligned(16))) = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
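// perm_pattern byte-reverses each 64-bit half of a vector (used to convert
// between the Java long[] layout and the byte order the algorithm works in);
// perm_pattern2 is the identity permutation of bytes 0..15.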
// Load the address of perm_pattern
__ load_const_optimized(temp1, (uintptr_t)&perm_pattern);
__ load_const_optimized(temp2, (uintptr_t)&perm_pattern2);
__ li(temp3, 0);
__ vxor(fromPerm, fromPerm, fromPerm); // Clear the vector register
__ lvxl(fromPerm, temp3, temp1);       // Load the byte-reversal permute pattern
__ li(temp1, 0xc2);
__ sldi(temp1, temp1, 56);             // 0xC2 << 56: GHASH reduction constant
// Load the vector from memory into vConstC2
__ vxor(vConstC2,vConstC2,vConstC2);
__ mtvrd(vConstC2, temp1);
__ vxor(vZero, vZero, vZero);
// Load H into vector registers
__ li(temp1, 0);
__ andi(temp1, subkeyH, 15);
__ cmpwi(CCR0,temp1,0);
__ beq(CCR0, L_aligned); // Branch if subkeyH is 16-byte aligned
__ li(temp1, 0); // Load immediate value 0 into temp
__ vxor(vH,vH,vH);


__ lvx(vHigh, temp1, subkeyH);       // Aligned block containing the start of H
__ lvsl(vPerm, temp1, subkeyH);      // Permute pattern for the misalignment
__ addi(subkeyH, subkeyH, 16);
__ lvx(vLow, temp1, subkeyH);        // The following aligned block
__ vec_perm(vH, vHigh, vLow, vPerm); // Splice the unaligned H together
__ subi(subkeyH, subkeyH, 16);       // Restore subkeyH

__ b(L_end);
__ bind(L_aligned);
__ lvx(vH,temp1,subkeyH);
__ bind(L_end);
__ vec_perm(vH, vH, vH, fromPerm);

__ li(temp1, 0);
__ andi(temp1, state, 15);
__ cmpwi(CCR0,temp1,0);
__ beq(CCR0, L_aligned3); // Branch if state is 16-byte aligned
__ li(temp1, 0);
__ vxor(vZero_Stored, vZero_Stored, vZero_Stored);
__ lvx(vHigh, temp1, state);         // Aligned block containing the start of state
__ lvsl(vPerm, temp1, state);        // Permute pattern for the misalignment
__ addi(state, state, 16);
__ lvx(vLow, temp1, state);          // The following aligned block
__ vec_perm(vZero_Stored, vHigh, vLow, vPerm); // Splice the unaligned state together
__ subi(state, state, 16);           // Restore state
__ b(L_end3);
__ bind(L_aligned3);
__ lvx(vZero_Stored,temp1,state);
__ bind(L_end3);
__ vec_perm(vZero_Stored, vZero_Stored, vZero_Stored, fromPerm);
__ vspltisb(vConst1, 1); // Vector of 1s
__ vspltisb(vConst7, 7); // Vector of 7s
__ vsldoi(vTmp4, vZero, vConst1, 1); // 0x1
__ vor(vTmp4, vConstC2, vTmp4); //0xC2...1
__ vsplt(vMSB, 0, vH); // MSB of H
@@ -832,30 +835,29 @@ __ li(temp1, 0xc2);
__ vsrab(vMSB, vMSB, vConst7);
__ vand(vMSB, vMSB, vTmp4); //Carry
__ vxor(vTmp2, vH_shift, vMSB); // shift H<<<1
__ vsldoi(vConstC2, vZero, vConstC2, 8);
__ vsldoi(vSwappedH, vTmp2, vTmp2, 8); // swap L,H
__ vsldoi(vLowerH, vZero, vSwappedH, 8); //H.L
__ vsldoi(vHigherH, vSwappedH, vZero, 8); //H.H

__ vxor(vTmp1, vTmp1, vTmp1);
__ vxor(vZero, vZero, vZero);
// Set up the loop counter with the number of blocks

__ mtctr(blocks);
__ li(temp1, 0);
__ load_const_optimized(temp2, (uintptr_t)&perm_pattern2);

Label loop;
__ bind(loop);

__ vxor(vX, vX, vX);
__ vxor(vZero, vZero, vZero);
__ li(temp1, 0); // Load immediate value 0 into temp

// Alignment check for the data pointer
__ andi(temp1, data, 15);
__ cmpwi(CCR0,temp1,0);
__ beq(CCR0, L_align_2);
__ li(temp1,0);
__ lvx(vHigh,temp1,align);
__ lvsl(fromPerm,temp1,align);
@@ -866,34 +868,55 @@ __ li(temp1, 0xc2);
__ b(L_end_2);
__ bind(L_align_2);
__ lvx(vX,temp1,data);

__ bind(L_end_2);
__ lvx(fromPerm, temp2); // Load perm_pattern2
__ vec_perm(vX, vX, vX, fromPerm);
__ addi(temp1, temp1, 16);


__ vxor(vX, vX, vZero_Stored); // XOR the running state into the new block
// Perform GCM multiplication
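// This is the standard POWER carry-less multiplication scheme: vpmsumd XORs
// two 64x64->128 carry-less products, so the three vpmsumd instructions below
// yield the low (L), middle (M) and high (H) partial products of X*H. M is
// then split across the halves (mL/mH), folded into L and H, and two
// multiplications by the 0xC2.. constant reduce the 256-bit product modulo
// the GHASH polynomial x^128 + x^7 + x^2 + x + 1.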
__ vpmsumd(vTmp1, vLowerH, vX); // L
__ vpmsumd(vTmp2, vSwappedH, vX); // M
__ vpmsumd(vTmp3, vHigherH, vX); // H
__ vpmsumd(vTmp4, vTmp1, vConstC2); // reduction

__ vsldoi(vTmp5, vTmp2, vZero, 8); // mL
__ vsldoi(vTmp6, vZero, vTmp2, 8); // mH

__ vxor(vTmp1, vTmp1, vTmp5); // LL + LL
__ vxor(vTmp3, vTmp3, vTmp6); // HH + HH

__ vsldoi(vTmp1, vTmp1, vTmp1, 8); // swap
__ vxor(vTmp1, vTmp1, vTmp4); // reduction

__ vsldoi(vTmp7, vTmp1, vTmp1, 8); // swap
__ vpmsumd(vTmp1, vTmp1, vConstC2); // reduction
__ vxor(vTmp7, vTmp7, vTmp3);
__ vxor(vZero, vTmp1, vTmp7);
__ vmr(vZero_Stored, vZero);
__ addi(data, data , 16);
__ bdnz(loop);
__ li(temp4, 0);
__ load_const_optimized(temp1, (uintptr_t)&perm_pattern);
__ lvx(fromPerm, temp1);
__ vec_perm(vZero, vZero, vZero, fromPerm); // Swap the result back to memory byte order
__ li(temp1, 0);
__ andi(temp1, state, 15);
__ cmpwi(CCR0,temp1,0);
__ beq(CCR0, L_aligned4); // Branch if state is 16-byte aligned
__ lvx(vHigh,temp4,state);
__ lvsr(vPerm,temp4,state);
__ addi(state,state,16);
__ lvx(vLow,temp4,state);
__ vspltisb(vConst1, -1); // All-ones bytes (0xFF)
__ vspltisb(vConst7, 0);  // All-zero bytes
__ vec_perm(vMask, vConst7, vConst1, vPerm); // 0xFF where the result bytes belong
__ vec_perm(vZero, vZero, vZero, vPerm); // Rotate the result into store position
__ vsel(vLow, vZero, vLow, vMask);       // Merge result tail into the second block
__ vsel(vHigh, vHigh, vZero, vMask);     // Merge result head into the first block
__ stvx(vLow, temp4, state);
__ addi(state, state, -16);
__ stvx(vHigh, temp4, state);
__ b(L_end4);
__ bind(L_aligned4);
__ stvx(vZero, temp4, state);
__ bind(L_end4);
__ blr(); // Return from function
return start;
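The epilogue mirrors the load fix for the final store of the state: when state is unaligned, the two aligned quadwords it straddles are loaded, the result is rotated into position with the lvsr pattern, and vsel merges it so that bytes outside the 16-byte destination are preserved. A hedged intrinsics sketch of the same read-modify-write idea (store_unaligned16 is a hypothetical name):

static void store_unaligned16(unsigned char* p, __vector unsigned char v) {
  if (((uintptr_t)p & 15) == 0) {
    vec_st(v, 0, p);                             // aligned: a single stvx
    return;
  }
  __vector unsigned char hi    = vec_ld(0, p);   // existing bytes, first block
  __vector unsigned char lo    = vec_ld(16, p);  // existing bytes, second block
  __vector unsigned char perm  = vec_lvsr(0, p); // right-shift pattern for p
  __vector unsigned char zeros = vec_splat_u8(0);
  __vector unsigned char ones  = (__vector unsigned char)vec_splat_s8(-1);
  __vector unsigned char mask  = vec_perm(zeros, ones, perm); // 0xFF where v lands
  __vector unsigned char vrot  = vec_perm(v, v, perm);        // rotate v into place
  vec_st(vec_sel(hi, vrot, mask), 0, p);         // merge head into first block
  vec_st(vec_sel(vrot, lo, mask), 16, p);        // merge tail into second block
}

Note the sequence is not atomic and briefly rewrites the neighboring bytes with their own values, which is fine for the stub's private state array but would not be safe for concurrently shared memory.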
