[openblas] 01/01: computational-error-of-dgemv_n.patch
Julian Taylor
jtaylor.debian at googlemail.com
Fri Mar 14 18:18:42 UTC 2014
This is an automated email from the git hooks/post-receive script.
jtaylor-guest pushed a commit to branch master
in repository openblas.
commit d867d0974d496cd9e7f39bb2d87edd6c1f70d074
Author: Julian Taylor <jtaylor.debian at googlemail.com>
Date: Fri Mar 14 19:17:23 2014 +0100
computational-error-of-dgemv_n.patch
Backport the fix for wrong dgemv results on the core2 kernel. The same issue
applies to cgemv, but it is not yet fixed upstream.
---
.../patches/computational-error-of-dgemv_n.patch | 447 +++++++++++++++++++++
debian/patches/series | 1 +
2 files changed, 448 insertions(+)
diff --git a/debian/patches/computational-error-of-dgemv_n.patch b/debian/patches/computational-error-of-dgemv_n.patch
new file mode 100644
index 0000000..500c1b5
--- /dev/null
+++ b/debian/patches/computational-error-of-dgemv_n.patch
@@ -0,0 +1,447 @@
+Origin: 2d557eb1e05eb, 9a557e90dafe
+Description: fix wrong result of dgemv_n
+Bug: https://github.com/xianyi/OpenBLAS/issues/340
+
+--- a/kernel/x86_64/dgemv_n.S
++++ b/kernel/x86_64/dgemv_n.S
+@@ -111,6 +111,9 @@
+ #define MM M
+ #endif
+
++#define TMP_M %r15
++#define Y2 %rbx
++
+ PROLOGUE
+ PROFCODE
+
+@@ -170,8 +173,9 @@
+ jge .L00t
+
+ movq MMM,M
+- addq I,M
++ addq M, I
+ jle .L999x
++ movq I, M
+
+ .L00t:
+ movq XX,X
+@@ -2463,21 +2467,23 @@
+ cmpq Y, BUFFER
+ je .L999
+ #endif
+-
++ movq M, TMP_M
++ movq Y, Y1
++
+ cmpq $SIZE, INCY
+ jne .L950
+
+- testq $SIZE, Y
++ testq $SIZE, Y1
+ je .L910
+
+- movsd (Y), %xmm0
++ movsd (Y1), %xmm0
+ addsd (BUFFER), %xmm0
+- movsd %xmm0, (Y)
++ movsd %xmm0, (Y1)
+
+- addq $SIZE, Y
++ addq $SIZE, Y1
+ addq $SIZE, BUFFER
+
+- decq M
++ decq TMP_M
+ jle .L999
+ ALIGN_4
+
+@@ -2485,20 +2491,20 @@
+ testq $SIZE, BUFFER
+ jne .L920
+
+- movq M, %rax
++ movq TMP_M, %rax
+ sarq $3, %rax
+ jle .L914
+ ALIGN_3
+
+ .L912:
+ #ifdef PREFETCHW
+- PREFETCHW (PREFETCHSIZE) * 4 + PREOFFSET(Y)
++ PREFETCHW (PREFETCHSIZE) * 4 + PREOFFSET(Y1)
+ #endif
+
+- movapd 0 * SIZE(Y), %xmm0
+- movapd 2 * SIZE(Y), %xmm1
+- movapd 4 * SIZE(Y), %xmm2
+- movapd 6 * SIZE(Y), %xmm3
++ movapd 0 * SIZE(Y1), %xmm0
++ movapd 2 * SIZE(Y1), %xmm1
++ movapd 4 * SIZE(Y1), %xmm2
++ movapd 6 * SIZE(Y1), %xmm3
+
+ movapd 0 * SIZE(BUFFER), %xmm4
+ movapd 2 * SIZE(BUFFER), %xmm5
+@@ -2514,12 +2520,12 @@
+ addpd %xmm6, %xmm2
+ addpd %xmm7, %xmm3
+
+- movapd %xmm0, 0 * SIZE(Y)
+- movapd %xmm1, 2 * SIZE(Y)
+- movapd %xmm2, 4 * SIZE(Y)
+- movapd %xmm3, 6 * SIZE(Y)
++ movapd %xmm0, 0 * SIZE(Y1)
++ movapd %xmm1, 2 * SIZE(Y1)
++ movapd %xmm2, 4 * SIZE(Y1)
++ movapd %xmm3, 6 * SIZE(Y1)
+
+- addq $8 * SIZE, Y
++ addq $8 * SIZE, Y1
+ addq $8 * SIZE, BUFFER
+
+ decq %rax
+@@ -2527,14 +2533,14 @@
+ ALIGN_3
+
+ .L914:
+- testq $7, M
++ testq $7, TMP_M
+ jle .L999
+
+- testq $4, M
++ testq $4, TMP_M
+ jle .L915
+
+- movapd 0 * SIZE(Y), %xmm0
+- movapd 2 * SIZE(Y), %xmm1
++ movapd 0 * SIZE(Y1), %xmm0
++ movapd 2 * SIZE(Y1), %xmm1
+
+ movapd 0 * SIZE(BUFFER), %xmm4
+ movapd 2 * SIZE(BUFFER), %xmm5
+@@ -2542,40 +2548,40 @@
+ addpd %xmm4, %xmm0
+ addpd %xmm5, %xmm1
+
+- movapd %xmm0, 0 * SIZE(Y)
+- movapd %xmm1, 2 * SIZE(Y)
++ movapd %xmm0, 0 * SIZE(Y1)
++ movapd %xmm1, 2 * SIZE(Y1)
+
+- addq $4 * SIZE, Y
++ addq $4 * SIZE, Y1
+ addq $4 * SIZE, BUFFER
+ ALIGN_3
+
+ .L915:
+- testq $2, M
++ testq $2, TMP_M
+ jle .L916
+
+- movapd (Y), %xmm0
++ movapd (Y1), %xmm0
+
+ movapd (BUFFER), %xmm4
+
+ addpd %xmm4, %xmm0
+
+- movapd %xmm0, (Y)
++ movapd %xmm0, (Y1)
+
+- addq $2 * SIZE, Y
++ addq $2 * SIZE, Y1
+ addq $2 * SIZE, BUFFER
+ ALIGN_3
+
+ .L916:
+- testq $1, M
++ testq $1, TMP_M
+ jle .L999
+
+- movsd (Y), %xmm0
++ movsd (Y1), %xmm0
+
+ movsd 0 * SIZE(BUFFER), %xmm4
+
+ addsd %xmm4, %xmm0
+
+- movlpd %xmm0, (Y)
++ movlpd %xmm0, (Y1)
+ ALIGN_3
+
+ jmp .L999
+@@ -2584,20 +2590,20 @@
+ .L920:
+ movapd -1 * SIZE(BUFFER), %xmm4
+
+- movq M, %rax
++ movq TMP_M, %rax
+ sarq $3, %rax
+ jle .L924
+ ALIGN_3
+
+ .L922:
+ #ifdef PREFETCHW
+- PREFETCHW (PREFETCHSIZE) * 4 + PREOFFSET(Y)
++ PREFETCHW (PREFETCHSIZE) * 4 + PREOFFSET(Y1)
+ #endif
+
+- movapd 0 * SIZE(Y), %xmm0
+- movapd 2 * SIZE(Y), %xmm1
+- movapd 4 * SIZE(Y), %xmm2
+- movapd 6 * SIZE(Y), %xmm3
++ movapd 0 * SIZE(Y1), %xmm0
++ movapd 2 * SIZE(Y1), %xmm1
++ movapd 4 * SIZE(Y1), %xmm2
++ movapd 6 * SIZE(Y1), %xmm3
+
+ movapd 1 * SIZE(BUFFER), %xmm5
+ movapd 3 * SIZE(BUFFER), %xmm6
+@@ -2618,14 +2624,14 @@
+ addpd %xmm6, %xmm2
+ addpd %xmm7, %xmm3
+
+- movapd %xmm0, 0 * SIZE(Y)
+- movapd %xmm1, 2 * SIZE(Y)
+- movapd %xmm2, 4 * SIZE(Y)
+- movapd %xmm3, 6 * SIZE(Y)
++ movapd %xmm0, 0 * SIZE(Y1)
++ movapd %xmm1, 2 * SIZE(Y1)
++ movapd %xmm2, 4 * SIZE(Y1)
++ movapd %xmm3, 6 * SIZE(Y1)
+
+ movapd %xmm8, %xmm4
+
+- addq $8 * SIZE, Y
++ addq $8 * SIZE, Y1
+ addq $8 * SIZE, BUFFER
+
+ decq %rax
+@@ -2633,14 +2639,14 @@
+ ALIGN_3
+
+ .L924:
+- testq $7, M
++ testq $7, TMP_M
+ jle .L999
+
+- testq $4, M
++ testq $4, TMP_M
+ jle .L925
+
+- movapd 0 * SIZE(Y), %xmm0
+- movapd 2 * SIZE(Y), %xmm1
++ movapd 0 * SIZE(Y1), %xmm0
++ movapd 2 * SIZE(Y1), %xmm1
+
+ movapd 1 * SIZE(BUFFER), %xmm5
+ movapd 3 * SIZE(BUFFER), %xmm6
+@@ -2651,20 +2657,20 @@
+ addpd %xmm4, %xmm0
+ addpd %xmm5, %xmm1
+
+- movapd %xmm0, 0 * SIZE(Y)
+- movapd %xmm1, 2 * SIZE(Y)
++ movapd %xmm0, 0 * SIZE(Y1)
++ movapd %xmm1, 2 * SIZE(Y1)
+
+ movapd %xmm6, %xmm4
+
+- addq $4 * SIZE, Y
++ addq $4 * SIZE, Y1
+ addq $4 * SIZE, BUFFER
+ ALIGN_3
+
+ .L925:
+- testq $2, M
++ testq $2, TMP_M
+ jle .L926
+
+- movapd (Y), %xmm0
++ movapd (Y1), %xmm0
+
+ movapd 1 * SIZE(BUFFER), %xmm5
+
+@@ -2672,25 +2678,25 @@
+
+ addpd %xmm4, %xmm0
+
+- movapd %xmm0, (Y)
++ movapd %xmm0, (Y1)
+
+ movaps %xmm5, %xmm4
+
+- addq $2 * SIZE, Y
++ addq $2 * SIZE, Y1
+ addq $2 * SIZE, BUFFER
+ ALIGN_3
+
+ .L926:
+- testq $1, M
++ testq $1, TMP_M
+ jle .L999
+
+- movsd (Y), %xmm0
++ movsd (Y1), %xmm0
+
+ shufpd $1, %xmm4, %xmm4
+
+ addsd %xmm4, %xmm0
+
+- movlpd %xmm0, (Y)
++ movlpd %xmm0, (Y1)
+ ALIGN_3
+
+ jmp .L999
+@@ -2700,53 +2706,53 @@
+ testq $SIZE, BUFFER
+ je .L960
+
+- movsd (Y), %xmm0
++ movsd (Y1), %xmm0
+ addsd (BUFFER), %xmm0
+- movsd %xmm0, (Y)
++ movsd %xmm0, (Y1)
+
+- addq INCY, Y
++ addq INCY, Y1
+ addq $SIZE, BUFFER
+
+- decq M
++ decq TMP_M
+ jle .L999
+ ALIGN_4
+
+ .L960:
+- movq Y, Y1
++ movq Y1, Y2
+
+- movq M, %rax
++ movq TMP_M, %rax
+ sarq $3, %rax
+ jle .L964
+ ALIGN_3
+
+ .L962:
+- movsd (Y), %xmm0
+- addq INCY, Y
+- movhpd (Y), %xmm0
+- addq INCY, Y
++ movsd (Y2), %xmm0
++ addq INCY, Y2
++ movhpd (Y2), %xmm0
++ addq INCY, Y2
+
+ movapd 0 * SIZE(BUFFER), %xmm4
+
+- movsd (Y), %xmm1
+- addq INCY, Y
+- movhpd (Y), %xmm1
+- addq INCY, Y
++ movsd (Y2), %xmm1
++ addq INCY, Y2
++ movhpd (Y2), %xmm1
++ addq INCY, Y2
+
+ movapd 2 * SIZE(BUFFER), %xmm5
+
+- movsd (Y), %xmm2
+- addq INCY, Y
+- movhpd (Y), %xmm2
+- addq INCY, Y
++ movsd (Y2), %xmm2
++ addq INCY, Y2
++ movhpd (Y2), %xmm2
++ addq INCY, Y2
+
+ movapd 4 * SIZE(BUFFER), %xmm6
+
+ addpd %xmm4, %xmm0
+
+- movsd (Y), %xmm3
+- addq INCY, Y
+- movhpd (Y), %xmm3
+- addq INCY, Y
++ movsd (Y2), %xmm3
++ addq INCY, Y2
++ movhpd (Y2), %xmm3
++ addq INCY, Y2
+
+ movapd 6 * SIZE(BUFFER), %xmm7
+
+@@ -2781,23 +2787,23 @@
+ ALIGN_3
+
+ .L964:
+- testq $7, M
++ testq $7, TMP_M
+ jle .L999
+
+- testq $4, M
++ testq $4, TMP_M
+ jle .L965
+
+- movsd (Y), %xmm0
+- addq INCY, Y
+- movhpd (Y), %xmm0
+- addq INCY, Y
++ movsd (Y2), %xmm0
++ addq INCY, Y2
++ movhpd (Y2), %xmm0
++ addq INCY, Y2
+
+ movapd 0 * SIZE(BUFFER), %xmm4
+
+- movsd (Y), %xmm1
+- addq INCY, Y
+- movhpd (Y), %xmm1
+- addq INCY, Y
++ movsd (Y2), %xmm1
++ addq INCY, Y2
++ movhpd (Y2), %xmm1
++ addq INCY, Y2
+
+ movapd 2 * SIZE(BUFFER), %xmm5
+
+@@ -2817,13 +2823,13 @@
+ ALIGN_3
+
+ .L965:
+- testq $2, M
++ testq $2, TMP_M
+ jle .L966
+
+- movsd (Y), %xmm0
+- addq INCY, Y
+- movhpd (Y), %xmm0
+- addq INCY, Y
++ movsd (Y2), %xmm0
++ addq INCY, Y2
++ movhpd (Y2), %xmm0
++ addq INCY, Y2
+
+ movapd 0 * SIZE(BUFFER), %xmm4
+
+@@ -2838,10 +2844,10 @@
+ ALIGN_3
+
+ .L966:
+- testq $1, M
++ testq $1, TMP_M
+ jle .L999
+
+- movsd (Y), %xmm0
++ movsd (Y2), %xmm0
+
+ movsd 0 * SIZE(BUFFER), %xmm4
+
+@@ -2853,6 +2859,9 @@
+ .L999:
+ leaq (, M, SIZE), %rax
+ addq %rax,AA
++ movq STACK_INCY, INCY
++ imulq INCY, %rax
++ addq %rax, Y
+ jmp .L0t
+ ALIGN_4
+
diff --git a/debian/patches/series b/debian/patches/series
index ea4b8a3..c82d882 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -6,3 +6,4 @@ remove-openmp-warning.patch
fork-handler.patch
dgemv-crash.patch
wrong-parameter-for-zherk-zher2.patch
+computational-error-of-dgemv_n.patch
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/openblas.git
More information about the debian-science-commits mailing list