forked from MeghanaGudaram/HighPerformanceComputing
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathAssignment2_ReadMemory.cpp
More file actions
54 lines (44 loc) · 1.66 KB
/
Assignment2_ReadMemory.cpp
File metadata and controls
54 lines (44 loc) · 1.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#include<stdio.h>
#include<stdlib.h>
#include<omp.h>
#include<immintrin.h>
#include<iostream>
#include<chrono>
#include<ctime>
using namespace std;
using namespace std::chrono;
/*
 * Reads `size` floats from `array` with 256-bit AVX loads and sums them.
 *
 * array : buffer to read; it is accessed with _mm256_load_ps, so it must be
 *         32-byte aligned.
 * size  : number of floats to read. Values <= 0 read nothing.
 *
 * Returns a vector whose eight elements all hold the sum of array[0..size-1].
 * The sum is returned (rather than discarded) so the loads have an observable
 * result for the caller to consume.
 */
__m256 avx_memoryRead(float* array, int size)
{
    __m256 sum = _mm256_setzero_ps();

    // Largest multiple of 8 that fits in size: only whole 8-float groups are
    // loaded as vectors, so a size that is not a multiple of 8 never makes
    // the load run past the end of the buffer.
    const int vectorEnd = (size > 0) ? size - (size % 8) : 0;
    for (int i = 0; i < vectorEnd; i += 8)
    {
        sum = _mm256_add_ps(_mm256_load_ps(&array[i]), sum); // Adding elements to sum, implying a read from memory
    }

    // Leftover elements (fewer than 8) are accumulated one at a time.
    float tail = 0.0f;
    for (int i = vectorEnd; i < size; ++i)
    {
        tail += array[i];
    }

    // _mm256_hadd_ps adds neighbouring pairs separately inside each 128-bit
    // half. After two rounds every element of the low half holds the sum of
    // accumulators 0..3 and every element of the high half holds the sum of
    // accumulators 4..7.
    sum = _mm256_hadd_ps(sum, sum);
    sum = _mm256_hadd_ps(sum, sum);

    // A third hadd would only double each half. Swap the two halves and add,
    // so every element holds the sum of all eight accumulators.
    sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01));

    return _mm256_add_ps(sum, _mm256_set1_ps(tail));
}
/*
 * Memory-read bandwidth benchmark.
 *
 * When built with OpenMP, every thread of the parallel region runs the block
 * below independently: it allocates its own 32-byte aligned buffer, fills it
 * with 1.0f, times 100 calls to avx_memoryRead over that buffer, and prints
 * its own elapsed time, bandwidth in GB/s and the first element of the last
 * returned vector.
 *
 * Returns 0 on success, non-zero if any thread failed to allocate its buffer.
 */
int main()
{
    int failed = 0; // combined across threads by the reduction below

    #pragma omp parallel reduction(|:failed)
    {
        const int size = 262144;   // 262144 floats * 4 bytes = 1 MiB per thread
        const int repetitions = 100;
        const int bytes = size * static_cast<int>(sizeof(float));

        void* raw = nullptr;
        const int status = posix_memalign(&raw, 32, size * sizeof(float)); // 32-byte alignment for _mm256_load_ps
        if (status != 0)
        {
            // Branching out of an OpenMP parallel region is not allowed, so
            // the failure is recorded and this thread skips the benchmark.
            fprintf(stderr, "posix_memalign failed with error %d\n", status);
            failed = 1;
        }
        else
        {
            float* array = static_cast<float*>(raw);
            for (int j = 0; j < size; j++)   // Initialize array to 1
                array[j] = 1;

            __m256 val = _mm256_setzero_ps();

            // steady_clock is monotonic, which is what interval timing needs.
            const std::chrono::steady_clock::time_point t1 = std::chrono::steady_clock::now();
            for (int i = 0; i < repetitions; i++)
            {
                val = avx_memoryRead(array, size);   // result is kept and printed so the reads are not dead code
            }
            const std::chrono::steady_clock::time_point t2 = std::chrono::steady_clock::now();

            const std::chrono::duration<double> time_span =
                std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1);

            // Bandwidth = total bytes read / elapsed seconds, scaled to GB/s.
            const double bw = (static_cast<double>(bytes) * repetitions) / (1000000000 * time_span.count());

            printf(" Time taken for 100 operations of %d bytes : %lf BW = %lf GB/s\n", bytes, time_span.count(), bw);
            // Extract element 0 with intrinsics instead of the compiler-specific val[0] subscript.
            printf("value %f\n", _mm_cvtss_f32(_mm256_castps256_ps128(val)));

            free(array);   // posix_memalign memory is released with free()
        }
    }
    return failed;
}