diff options
Diffstat (limited to 'OLD/csci5451')
-rw-r--r-- | OLD/csci5451/ass1.tex | 139 | ||||
-rw-r--r-- | OLD/csci5451/ass1p6.c | 103 |
2 files changed, 242 insertions, 0 deletions
diff --git a/OLD/csci5451/ass1.tex b/OLD/csci5451/ass1.tex new file mode 100644 index 0000000..24f2e0c --- /dev/null +++ b/OLD/csci5451/ass1.tex @@ -0,0 +1,139 @@ +\documentclass[12pt]{article} +\usepackage{fullpage}\usepackage{listings} +\title{Assignment 1} +\author{Andrea Smith (smit9523@umn.edu), Matt Strapp (strap012@umn.edu)} +\date{2021-09-18} + +\begin{document} +\maketitle + \section{Question 1} + \subsection*{Overlapping Intervals} + The time for one message to be sent 1 hop is \(t_s+t_w*m/k\). After the first message is sent forward one hop, we account for the final message \((d-1)*t_w*m/k\), making the final expression + \[k(t_s+t_w+m/k)+((d-1)t_w*m/k)\] + \subsection*{Non-Overlapping Intervals} + \[t_{transfer} = t_s*k+t_w*d*m\] + \subsection*{} + For both cases, as \(k\) goes to \(m\), the time to transfer will increase greatly. If \(t_s\) is very large, the optimal value of \(k\) is 1. In other words, it is better to transfer the message all at once instead of in \(k\) parts. If \(t_s\) is 0 it has little to no effect on overall transmission time regardless of \(k\) being large or small. + \section{Question 2} + \subsection*{A} + Shared memory has all of the processors access one large pool of memory while distributed memory has each processor have a section of the memory. + \subsection*{B} + Distributed memory programs communicate via pipes and message queues while shared memory programs share memory with locks in place to prevent unintended behavior and race conditions. + \subsection*{C} + Shared memory is in most devices nowadays with even phones having multiple cores and a single bank of shared memory. Large rendering farms and HPC will have distributed memory to split rendering or to maximize performance. + \subsection*{D} + Distributed is easier to scale because the cluster can have nodes added or removed easily without affecting the other nodes. + \section{Question 3} + \subsection*{A}\ + \noindent Concurrency: 8\\ + Critical path length: 4\\ + Maximum achievable speedup: \(15/4\) \\ + Minimum number of processes needed: 8\\ + Maximum achievable speedup if the number of processes is limited to: + \begin{itemize} + \item[2]-- \(15/8\) + \item[4]-- \(15/5\) + \item[8]-- \(15/4\) + \end{itemize} + \subsection*{B} + Concurrency: 8\\ + Critical path length: \\ + Maximum achievable speedup: \\ + Minimum number of processes needed: \\ + Maximum achievable speedup if the number of processes is limited to: + \begin{itemize} + \item[2]-- \(15/8\) + \item[4]-- \(15/5\) + \item[8]-- \(15/4\) + \end{itemize} + \subsection*{C} + Concurrency: 8\\ + Critical path length: 7\\ + Maximum achievable speedup: \(14/7\)\\ + Minimum number of processes needed: \\ + Maximum achievable speedup if the number of processes is limited to: + \begin{itemize} + \item[2]-- \(14/10\) + \item[4]-- \(14/8\) + \item[8]-- \(14/7\) + \end{itemize} + \subsection*{D} + Concurrency: 2\\ + Critical path length: 8\\ + Maximum achievable speedup: \(15/8\)\\ + Minimum number of processes needed: 2\\ + Maximum achievable speedup if the number of processes is limited to: + \begin{itemize} + \item[2]-- \(15/8\) + \item[4]-- \(15/8\) + \item[8]-- \(15/8\) + \end{itemize} + + \section{Question 4} + \subsection*{A} + A[] is partitioned into uniform sub arrays that are of size N/P. Each process counts the frequency of every integer in the range R in its subarray. Each process then forwards its result to the next process, summing each integer count in its appropriate index until H[] is complete. + \subsection*{B} + Each process gets a subsection of the range of size R/P and all of A[]. Each process counts the frequency of the integers in its subsection. At the end each process forwards its result to the next process until the final process contains the array of counts H[]. + \subsection*{C} + Input Partitioning: + \begin{itemize} + \item Requires less memory bc each proc gets N/P + \item Almost always faster than output partitioning + \item IF \(R>>N\), could waste time looking for R values that aren’t even in A + \end{itemize} + Output Partitioning: + \begin{itemize} + \item Requires more memory because every process gets all of A[], potentially impossible depending on memory size. + \item Almost always slower unless \(R>>N\),, where input partitioning would waste time + \end{itemize} + \section{Question 5} + Propagate message to all of column 1: + \begin{displaymath} + t_{single}=t_s+t_w*m*C/2 + \end{displaymath} + \begin{displaymath} + t_{column}=\sum_{i=0}^{\log_2(C)-1} t_s+t_w*m*d + \end{displaymath} + After message propagates down the column, do the same for the rows in parallel. + \begin{displaymath} + t_{row}=\sum_{i=0}^{\log_2(R)-1} t_s+t_w*m*d + \end{displaymath} + The total time is the sum of the time along the rows and the columns. + \begin{displaymath} + t_{comm}=t_{column}+t_{row} + \end{displaymath} + + Pseudo code: + \begin{lstlisting}[language=C]] + //Assume first processor with message is (0,0) + for(int c=0;c<(C/2);c++) { + forward_message(0,c,msg); + } + int i=0, j=0; + //Column loop, parallelized between both messages + while(true) { + if (message is not in node) { + send_message(i,j++,msg); + } else { + break; + } + } + //Parallelized among all rows + for(int r=0;r<(r/2);r++) { + forward_message(r,0,msg); + } + //Row loop, parallelized between all rows with messages + while(true) { + if (message is not in node) { + send_message(i++,j,msg); + } else { + break; + } + } + \end{lstlisting} + \section{Question 6} + \subsection*{A} + Loops: One way to parallelize the heat problem is to compute the inner loop that computes H[t+1] values simultaneously. It would not be feasible to select the outer, doubly nested loop because t increments with every outer loop. You cannot parallelize a process that relies on the previous time. The other loops either display the chart or set the problem up which needs to be sequential. + \subsection*{B} + Let each processor have a single chunk of the rod, with the processors lined up so that the chunks are in a straight line shaped like the rod. Each node only needs to communicate with nodes directly adjacent to it as time progresses, minimizing the needed communication. +\end{document}
\ No newline at end of file diff --git a/OLD/csci5451/ass1p6.c b/OLD/csci5451/ass1p6.c new file mode 100644 index 0000000..d87eddd --- /dev/null +++ b/OLD/csci5451/ass1p6.c @@ -0,0 +1,103 @@ +#include <stdio.h> +#include <stdlib.h> +// HEAT TRANSFER SIMULATION +// +// Simple physical simulation of a rod connected at the left and right +// ends to constant temperature heat/cold sources. All positions on +// the rod are set to an initial temperature. Each time step, that +// temperature is altered by computing the difference between a cells +// temperature and its left and right neighbors. A constant k +// (thermal conductivity) adjusts these differences before altering +// the heat at a cell. Use the following model to compute the heat +// for a position on the rod according to the finite difference +// method. +// +// left_diff = H[t][p] - H[t][p-1]; +// right_diff = H[t][p] - H[t][p+1]; +// delta = -k*( left_diff + right_diff ) +// H[t+1][p] = H[t][p] + delta +// +// Substituting the above, one can get the following +// +// H[t+1][p] = H[t][p] + k*H[t][p-1] - 2*k*H[t][p] + k*H[t][p+1] +// +// The matrix H is computed for all time steps and all positions on +// the rod and displayed after running the simulation. The simulation +// is run for a fixed number of time steps rather than until +// temperatures reach steady state. + +int main(int argc, char **argv) +{ + int max_time = 50; // Number of time steps to simulate + int width = 20; // Number of cells in the rod + double initial_temp = 50.0; // Initial temp of internal cells + double L_bound_temp = 20.0; // Constant temp at Left end of rod + double R_bound_temp = 80.0; // Constant temp at Right end of rod + double k = 0.5; // thermal conductivity constant + double **H; // 2D array of temps at times/locations + + // Allocate memory + H = malloc(sizeof(double *) * max_time); + int t, p; + for (t = 0; t < max_time; t++) + { + H[t] = malloc(sizeof(double *) * width); + } + + // Initialize constant left/right boundary temperatures + for (t = 0; t < max_time; t++) + { + H[t][0] = L_bound_temp; + H[t][width - 1] = R_bound_temp; + } + + // Initialize temperatures at time 0 + t = 0; + for (p = 1; p < width - 1; p++) + { + H[t][p] = initial_temp; + } + + // Simulate the temperature changes for internal cells + for (t = 0; t < max_time - 1; t++) + { + for (p = 1; p < width - 1; p++) + { + double left_diff = H[t][p] - H[t][p - 1]; + double right_diff = H[t][p] - H[t][p + 1]; + double delta = -k * (left_diff + right_diff); + H[t + 1][p] = H[t][p] + delta; + } + } + + // Print results + printf("Temperature results for 1D rod\n"); + printf("Time step increases going down rows\n"); + printf("Position on rod changes going accross columns\n"); + + // Column headers + printf("%3s| ", ""); + for (p = 0; p < width; p++) + { + printf("%5d ", p); + } + printf("\n"); + printf("%3s+-", "---"); + for (p = 0; p < width; p++) + { + printf("------"); + } + printf("\n"); + // Row headers and data + for (t = 0; t < max_time; t++) + { + printf("%3d| ", t); + for (p = 0; p < width; p++) + { + printf("%5.1f ", H[t][p]); + } + printf("\n"); + } + + return 0; +} |