Program consists of threads of control.
A practical aside...
gcc -fopenmp -c foo.c
gcc -fopenmp -o mycode.x foo.o
icc -openmp -c foo.c
icc -openmp -o mycode.x foo.o
#include <stdio.h>
#include <omp.h>
/* Every thread of the OpenMP team prints a greeting with its thread number. */
int main()
{
    #pragma omp parallel
    {
        int tid = omp_get_thread_num();
        printf("Hello world from %d\n", tid);
    }
    return 0;
}
double s[MAX_THREADS];
int i;
/* s is shared by the whole team; each thread works on its own copy of i. */
#pragma omp parallel private(i) shared(s)
{
    /* Each thread stores its thread number in its own slot of s. */
    i = omp_get_thread_num();
    s[i] = (double) i;
}
/*
 * Parallel region whose updates to a shared data structure are serialized
 * by a named critical section.
 */
#pragma omp parallel
{
    //...
    /*
     * At most one thread at a time executes a critical section with a given
     * name; the name is written in parentheses after the directive.
     */
    #pragma omp critical(my_data_cs)
    {
        //... modify data structure here ...
    }
}
/*
 * Every thread in the team runs all nsteps iterations; the barrier makes
 * the threads wait for each other at the end of each step.
 * i is declared outside the region, so it must be made private: otherwise
 * all threads would update one shared loop counter.
 */
#pragma omp parallel private(i)
for (i = 0; i < nsteps; ++i) {
    do_stuff();
    #pragma omp barrier
}
/*
 * Compute dot of x and y of length n.
 * Each thread accumulates a private partial sum (my_dot) over its share of
 * the loop iterations, then adds it into the shared result (dot) inside a
 * critical section so the updates do not race.
 */
int i;
double my_dot, dot = 0;
#pragma omp parallel \
    shared(dot,x,y,n) \
    private(i,my_dot)
{
    /* Private copies start uninitialized, so zero the partial sum here. */
    my_dot = 0;
    #pragma omp for
    for (i = 0; i < n; ++i)
        my_dot += x[i]*y[i];
    /* One thread at a time folds its partial sum into the shared total. */
    #pragma omp critical
    dot += my_dot;
}
/* Compute dot of x and y of length n */
int i, tid;
double dot = 0;
#pragma omp parallel \
shared(x,y,n) \
private(i) \
reduction(+:dot)
{
#pragma omp for
for (i = 0; i < n; ++i)
dot += x[i]*y[i];
}
Partition index space different ways:
- static[(chunk)]: decide at start of loop; default chunk is n/nthreads. Low overhead, potential load imbalance.
- dynamic[(chunk)]: each thread takes chunk iterations when it has time; default chunk is 1. Higher overhead, but automatically balances load.
- guided: take chunks of size (unassigned iterations)/threads; chunks get smaller toward end of loop. Somewhere between static and dynamic.
- auto: up to the system!
Default behavior is implementation-dependent.
- single: do only in one thread (e.g. I/O)
- master: do only in one thread; others skip
- sections: like cobegin/coend
#pragma omp parallel
{
// Only one thread of the team executes the single block and creates the tasks.
#pragma omp single
{
// General setup work
// Each task directive below packages the following call as a separate task.
#pragma omp task
task1();
#pragma omp task
task2();
// Wait here until the child tasks created above have completed.
#pragma omp taskwait
depends_on_both_tasks();
}
}
Adapted from an SC13 presentation
node_t *p = head;
#pragma omp parallel
{
    /*
     * A single thread walks the list and creates one task per node; nowait
     * lets the other threads skip the barrier at the end of the single.
     */
    #pragma omp single nowait
    {
        for (; p != NULL; p = p->next) {
            /* firstprivate captures the current node pointer for the task. */
            #pragma omp task firstprivate(p)
            do_work(p);
        }
    }
} /* Implied barrier at end of parallel region */
/*
 * Traverse the tree rooted at p, visiting children before the node itself.
 * Each existing child subtree is traversed in its own task; the taskwait
 * makes sure both child tasks have finished before p->data is processed.
 * p must not be NULL.
 */
void traverse(node_t* p)
{
    if (p->left) {
        #pragma omp task
        traverse(p->left);
    }
    if (p->right) {
        #pragma omp task
        traverse(p->right);
    }
    #pragma omp taskwait
    process(p->data);
}
Fred Brooks (The Mythical Man-Month) identified two types of software complexity: essential and accidental.
Does OpenMP address accidental complexity? Yes, somewhat!
Essential complexity is harder.
Let’s focus again on memory issues...