Skip to content
Snippets Groups Projects
Commit 194ee2d8 authored by Matthias Keck's avatar Matthias Keck
Browse files

Delete job_output_19274.log

parent e654c94a
No related merge requests found
----benchmark.py Matze-----
### causal=False, headdim=64, batch_size=32, seqlen=512 ###
Flash2 fwd: 124.00 TFLOPs/s, bwd: 112.94 TFLOPs/s, fwd + bwd: 115.90 TFLOPs/s
Flash fwd: 113.09 TFLOPs/s, bwd: 48.43 TFLOPs/s, fwd + bwd: 57.89 TFLOPs/s
Pytorch fwd: 26.78 TFLOPs/s, bwd: 32.72 TFLOPs/s, fwd + bwd: 30.77 TFLOPs/s
### causal=False, headdim=64, batch_size=16, seqlen=1024 ###
Flash2 fwd: 177.25 TFLOPs/s, bwd: 138.19 TFLOPs/s, fwd + bwd: 147.47 TFLOPs/s
Flash fwd: 225.35 TFLOPs/s, bwd: 97.02 TFLOPs/s, fwd + bwd: 115.87 TFLOPs/s
Pytorch fwd: 30.95 TFLOPs/s, bwd: 36.50 TFLOPs/s, fwd + bwd: 34.72 TFLOPs/s
### causal=False, headdim=64, batch_size=8, seqlen=2048 ###
Flash2 fwd: 182.79 TFLOPs/s, bwd: 154.32 TFLOPs/s, fwd + bwd: 161.51 TFLOPs/s
Flash fwd: 446.43 TFLOPs/s, bwd: 194.16 TFLOPs/s, fwd + bwd: 231.55 TFLOPs/s
Pytorch fwd: 22.10 TFLOPs/s, bwd: 39.29 TFLOPs/s, fwd + bwd: 32.15 TFLOPs/s
### causal=False, headdim=64, batch_size=4, seqlen=4096 ###
Flash2 fwd: 182.21 TFLOPs/s, bwd: 165.91 TFLOPs/s, fwd + bwd: 170.26 TFLOPs/s
Flash fwd: 891.89 TFLOPs/s, bwd: 389.60 TFLOPs/s, fwd + bwd: 464.31 TFLOPs/s
Pytorch fwd: 30.22 TFLOPs/s, bwd: 40.62 TFLOPs/s, fwd + bwd: 36.99 TFLOPs/s
### causal=False, headdim=64, batch_size=2, seqlen=8192 ###
Flash2 fwd: 181.59 TFLOPs/s, bwd: 171.85 TFLOPs/s, fwd + bwd: 174.52 TFLOPs/s
Flash fwd: 1760.53 TFLOPs/s, bwd: 778.38 TFLOPs/s, fwd + bwd: 925.97 TFLOPs/s
Pytorch fwd: 36.51 TFLOPs/s, bwd: 41.64 TFLOPs/s, fwd + bwd: 40.03 TFLOPs/s
### causal=False, headdim=64, batch_size=1, seqlen=16384 ###
Flash2 fwd: 181.60 TFLOPs/s, bwd: 174.34 TFLOPs/s, fwd + bwd: 176.35 TFLOPs/s
Flash fwd: 3538.54 TFLOPs/s, bwd: 1562.19 TFLOPs/s, fwd + bwd: 1858.81 TFLOPs/s
Pytorch fwd: 0.00 TFLOPs/s, bwd: 0.00 TFLOPs/s, fwd + bwd: 0.00 TFLOPs/s
### causal=False, headdim=128, batch_size=32, seqlen=512 ###
Flash2 fwd: 186.08 TFLOPs/s, bwd: 118.85 TFLOPs/s, fwd + bwd: 132.53 TFLOPs/s
Flash fwd: 122.46 TFLOPs/s, bwd: 50.12 TFLOPs/s, fwd + bwd: 60.29 TFLOPs/s
Pytorch fwd: 38.29 TFLOPs/s, bwd: 51.14 TFLOPs/s, fwd + bwd: 46.67 TFLOPs/s
### causal=False, headdim=128, batch_size=16, seqlen=1024 ###
Flash2 fwd: 211.56 TFLOPs/s, bwd: 147.57 TFLOPs/s, fwd + bwd: 161.53 TFLOPs/s
Flash fwd: 239.83 TFLOPs/s, bwd: 99.56 TFLOPs/s, fwd + bwd: 119.53 TFLOPs/s
Pytorch fwd: 50.02 TFLOPs/s, bwd: 62.96 TFLOPs/s, fwd + bwd: 58.62 TFLOPs/s
### causal=False, headdim=128, batch_size=8, seqlen=2048 ###
Flash2 fwd: 216.34 TFLOPs/s, bwd: 166.11 TFLOPs/s, fwd + bwd: 177.91 TFLOPs/s
Flash fwd: 476.83 TFLOPs/s, bwd: 198.49 TFLOPs/s, fwd + bwd: 238.22 TFLOPs/s
Pytorch fwd: 40.28 TFLOPs/s, bwd: 72.05 TFLOPs/s, fwd + bwd: 58.80 TFLOPs/s
### causal=False, headdim=128, batch_size=4, seqlen=4096 ###
Flash2 fwd: 218.39 TFLOPs/s, bwd: 180.89 TFLOPs/s, fwd + bwd: 190.23 TFLOPs/s
Flash fwd: 953.22 TFLOPs/s, bwd: 396.59 TFLOPs/s, fwd + bwd: 476.01 TFLOPs/s
Pytorch fwd: 54.02 TFLOPs/s, bwd: 74.23 TFLOPs/s, fwd + bwd: 67.06 TFLOPs/s
### causal=False, headdim=128, batch_size=2, seqlen=8192 ###
Flash2 fwd: 213.65 TFLOPs/s, bwd: 187.38 TFLOPs/s, fwd + bwd: 194.20 TFLOPs/s
Flash fwd: 1902.26 TFLOPs/s, bwd: 793.37 TFLOPs/s, fwd + bwd: 951.91 TFLOPs/s
Pytorch fwd: 66.05 TFLOPs/s, bwd: 69.02 TFLOPs/s, fwd + bwd: 68.14 TFLOPs/s
### causal=False, headdim=128, batch_size=1, seqlen=16384 ###
Flash2 fwd: 214.78 TFLOPs/s, bwd: 190.87 TFLOPs/s, fwd + bwd: 197.14 TFLOPs/s
Flash fwd: 3797.81 TFLOPs/s, bwd: 1591.38 TFLOPs/s, fwd + bwd: 1908.11 TFLOPs/s
Pytorch fwd: 72.72 TFLOPs/s, bwd: 81.53 TFLOPs/s, fwd + bwd: 78.80 TFLOPs/s
### causal=True, headdim=64, batch_size=32, seqlen=512 ###
Flash2 fwd: 85.49 TFLOPs/s, bwd: 62.78 TFLOPs/s, fwd + bwd: 67.93 TFLOPs/s
Flash fwd: 45.63 TFLOPs/s, bwd: 21.06 TFLOPs/s, fwd + bwd: 24.89 TFLOPs/s
Pytorch fwd: 9.04 TFLOPs/s, bwd: 16.34 TFLOPs/s, fwd + bwd: 13.28 TFLOPs/s
### causal=True, headdim=64, batch_size=16, seqlen=1024 ###
Flash2 fwd: 138.83 TFLOPs/s, bwd: 104.60 TFLOPs/s, fwd + bwd: 112.52 TFLOPs/s
Flash fwd: 115.53 TFLOPs/s, bwd: 48.26 TFLOPs/s, fwd + bwd: 57.89 TFLOPs/s
Pytorch fwd: 9.90 TFLOPs/s, bwd: 18.23 TFLOPs/s, fwd + bwd: 14.69 TFLOPs/s
### causal=True, headdim=64, batch_size=8, seqlen=2048 ###
Flash2 fwd: 160.94 TFLOPs/s, bwd: 129.94 TFLOPs/s, fwd + bwd: 137.51 TFLOPs/s
Flash fwd: 226.23 TFLOPs/s, bwd: 96.67 TFLOPs/s, fwd + bwd: 115.58 TFLOPs/s
Pytorch fwd: 6.75 TFLOPs/s, bwd: 19.72 TFLOPs/s, fwd + bwd: 12.74 TFLOPs/s
### causal=True, headdim=64, batch_size=4, seqlen=4096 ###
Flash2 fwd: 171.01 TFLOPs/s, bwd: 149.51 TFLOPs/s, fwd + bwd: 155.08 TFLOPs/s
Flash fwd: 457.93 TFLOPs/s, bwd: 192.21 TFLOPs/s, fwd + bwd: 230.41 TFLOPs/s
Pytorch fwd: 8.53 TFLOPs/s, bwd: 20.33 TFLOPs/s, fwd + bwd: 14.57 TFLOPs/s
### causal=True, headdim=64, batch_size=2, seqlen=8192 ###
Flash2 fwd: 175.77 TFLOPs/s, bwd: 162.00 TFLOPs/s, fwd + bwd: 165.71 TFLOPs/s
Flash fwd: 912.79 TFLOPs/s, bwd: 387.71 TFLOPs/s, fwd + bwd: 463.97 TFLOPs/s
Pytorch fwd: 0.00 TFLOPs/s, bwd: 0.00 TFLOPs/s, fwd + bwd: 0.00 TFLOPs/s
### causal=True, headdim=64, batch_size=1, seqlen=16384 ###
Flash2 fwd: 180.68 TFLOPs/s, bwd: 170.68 TFLOPs/s, fwd + bwd: 173.42 TFLOPs/s
Flash fwd: 1811.72 TFLOPs/s, bwd: 776.12 TFLOPs/s, fwd + bwd: 927.62 TFLOPs/s
Pytorch fwd: 0.00 TFLOPs/s, bwd: 0.00 TFLOPs/s, fwd + bwd: 0.00 TFLOPs/s
### causal=True, headdim=128, batch_size=32, seqlen=512 ###
Flash2 fwd: 100.43 TFLOPs/s, bwd: 78.44 TFLOPs/s, fwd + bwd: 83.67 TFLOPs/s
Flash fwd: 62.08 TFLOPs/s, bwd: 25.04 TFLOPs/s, fwd + bwd: 30.19 TFLOPs/s
Pytorch fwd: 14.17 TFLOPs/s, bwd: 25.63 TFLOPs/s, fwd + bwd: 20.82 TFLOPs/s
### causal=True, headdim=128, batch_size=16, seqlen=1024 ###
Flash2 fwd: 155.09 TFLOPs/s, bwd: 112.71 TFLOPs/s, fwd + bwd: 122.26 TFLOPs/s
Flash fwd: 123.34 TFLOPs/s, bwd: 50.04 TFLOPs/s, fwd + bwd: 60.27 TFLOPs/s
Pytorch fwd: 17.16 TFLOPs/s, bwd: 31.57 TFLOPs/s, fwd + bwd: 25.46 TFLOPs/s
### causal=True, headdim=128, batch_size=8, seqlen=2048 ###
Flash2 fwd: 179.23 TFLOPs/s, bwd: 138.57 TFLOPs/s, fwd + bwd: 148.17 TFLOPs/s
Flash fwd: 242.34 TFLOPs/s, bwd: 99.95 TFLOPs/s, fwd + bwd: 120.11 TFLOPs/s
Pytorch fwd: 12.70 TFLOPs/s, bwd: 36.13 TFLOPs/s, fwd + bwd: 23.66 TFLOPs/s
### causal=True, headdim=128, batch_size=4, seqlen=4096 ###
Flash2 fwd: 191.16 TFLOPs/s, bwd: 160.14 TFLOPs/s, fwd + bwd: 167.92 TFLOPs/s
Flash fwd: 490.62 TFLOPs/s, bwd: 199.06 TFLOPs/s, fwd + bwd: 239.77 TFLOPs/s
Pytorch fwd: 15.84 TFLOPs/s, bwd: 37.08 TFLOPs/s, fwd + bwd: 26.81 TFLOPs/s
### causal=True, headdim=128, batch_size=2, seqlen=8192 ###
Flash2 fwd: 193.90 TFLOPs/s, bwd: 173.11 TFLOPs/s, fwd + bwd: 178.58 TFLOPs/s
Flash fwd: 965.87 TFLOPs/s, bwd: 399.06 TFLOPs/s, fwd + bwd: 479.45 TFLOPs/s
Pytorch fwd: 18.55 TFLOPs/s, bwd: 39.99 TFLOPs/s, fwd + bwd: 30.06 TFLOPs/s
### causal=True, headdim=128, batch_size=1, seqlen=16384 ###
Flash2 fwd: 195.56 TFLOPs/s, bwd: 181.91 TFLOPs/s, fwd + bwd: 185.61 TFLOPs/s
Flash fwd: 1920.67 TFLOPs/s, bwd: 798.06 TFLOPs/s, fwd + bwd: 958.05 TFLOPs/s
Pytorch fwd: 19.11 TFLOPs/s, bwd: 40.84 TFLOPs/s, fwd + bwd: 30.83 TFLOPs/s
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment