diff --git a/README.md b/README.md
index 98dd9a8..87518dd 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,40 @@
-**University of Pennsylvania, CIS 565: GPU Programming and Architecture,
-Project 1 - Flocking**
+# University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking #
+
+## Xiaomao Ding ##
+* Tested on: Windows 8.1, i7-4700MQ @ 2.40GHz 8.00GB, GT 750M 2047MB (Personal Computer)
+
+## Intro ##
+The code in this repo is part of Project 1 for CIS565 Fall 2016 at UPenn. For this project, I accelerated the [Reynolds boids algorithm](http://www.red3d.com/cwr/boids/) using NVIDIA CUDA kernels. There are three different implementations: a brute force method that compares each boid to every other boid, a uniform grid method that divides the space into a grid to limit the neighbor search, and a coherent uniform grid method that additionally reorders the position and velocity buffers so that boids in the same grid cell are contiguous in memory. See the gif below for an example of the algorithm in action! Each color represents a different flock of boids.
+
+![Flocking](https://github.com/xnieamo/Project1-CUDA-Flocking/blob/master/images/dt0.2_particles16000.gif)
+
+Above is a gif generated using the code in this repo with 16000 boids and the coherent grid implementation.
+
+### Quick Note ###
+Before running any of the code in this repo, you may need to adjust the compute capability flag in `src/CMakeLists.txt`. To do so, change `-arch=sm_30` to match your GPU's compute capability: `sm_20` corresponds to compute capability 2.0, `sm_30` to 3.0, and so on. (A sketch for querying your GPU's compute capability appears right after this README diff.)
+
+## Performance Analysis ##
+
+### Number of Boids ###
+To analyze the performance of each implementation, we use the FPS without visualization as a metric. The first thing we would like to know is how each algorithm scales with the number of boids.
+
+![FPSvNumBoidPlot](https://github.com/xnieamo/Project1-CUDA-Flocking/blob/master/images/PerformanceVBoidNum.png)
+
+In the graph above, it is clear that the brute force method performs the worst. This is because the number of comparisons for each boid increases linearly with the number of boids. The scattered uniform grid performs much better, as it drastically reduces the number of comparisons needed for each boid. What is surprising is the further dramatic increase in performance for the coherent uniform grid! Even though the coherent and scattered grids make the same number of comparisons, the difference in performance between them is similar to that between the scattered grid and the brute force method. The only change is that the coherent version removes the intermediate array used for grid indexing, reordering the boid data so that each cell's boids are contiguous in memory. This suggests that reading from global memory is a significant bottleneck in our GPU implementations.
+
+### Block Size and Count ###
+We might also be interested in the performance of our implementations for varying block sizes on the GPU. Below we see that performance is roughly equivalent for each block size. This makes sense: increasing the block size (and thereby decreasing the block count) does not change the total number of threads launched, and each boid's update is an independent calculation. These graphs were generated using a coherent grid with 16000 boids.
+
+![FPSvBlockSize](https://github.com/xnieamo/Project1-CUDA-Flocking/blob/master/images/PerformanceVBlockSize.png)
+
+### dT ###
+What is somewhat surprising is that changing the time step parameter, dT, also affects performance. As dT increases, performance increases drastically, as shown in the graph below. This is possibly related to the fact that at high dT all the boids quickly collapse into one giant flock. This graph was generated using a coherent grid with 16000 boids.
+
+![FPSvdT](https://github.com/xnieamo/Project1-CUDA-Flocking/blob/master/images/PerformanceVdt.png)
+
+![FastFlock](https://github.com/xnieamo/Project1-CUDA-Flocking/blob/master/images/dt1.6_particles16000.gif)
+
+
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
-### (TODO: Your README)
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
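Before the source diffs, one aside on the Quick Note above: if you are unsure of your GPU's compute capability, a minimal standalone host program can query it through the CUDA runtime. This sketch is illustrative only and is not part of the repo:

```cpp
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    cudaDeviceProp prop;
    // Query device 0; prop.major / prop.minor give the compute capability,
    // e.g. 3.0 on a GT 750M, which corresponds to -arch=sm_30.
    cudaError_t err = cudaGetDeviceProperties(&prop, 0);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    printf("Compute capability %d.%d -> use -arch=sm_%d%d\n",
           prop.major, prop.minor, prop.major, prop.minor);
    return 0;
}
```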
diff --git a/images/PerformanceVBlockSize.png b/images/PerformanceVBlockSize.png
new file mode 100644
index 0000000..d3ce046
Binary files /dev/null and b/images/PerformanceVBlockSize.png differ
diff --git a/images/PerformanceVBoidNum.png b/images/PerformanceVBoidNum.png
new file mode 100644
index 0000000..09be664
Binary files /dev/null and b/images/PerformanceVBoidNum.png differ
diff --git a/images/PerformanceVdt.png b/images/PerformanceVdt.png
new file mode 100644
index 0000000..01b3f82
Binary files /dev/null and b/images/PerformanceVdt.png differ
diff --git a/images/dt0.2_particles16000.gif b/images/dt0.2_particles16000.gif
new file mode 100644
index 0000000..71ffae3
Binary files /dev/null and b/images/dt0.2_particles16000.gif differ
diff --git a/images/dt1.6_particles16000.gif b/images/dt1.6_particles16000.gif
new file mode 100644
index 0000000..2c9f165
Binary files /dev/null and b/images/dt1.6_particles16000.gif differ
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index fdd636d..750f0cb 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -10,5 +10,5 @@ set(SOURCE_FILES
 cuda_add_library(src
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_30
     )
diff --git a/src/kernel.cu b/src/kernel.cu
index 30356b9..ae89a13 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -21,14 +21,14 @@
  * Check for CUDA errors; print and exit if there was a problem.
  */
 void checkCUDAError(const char *msg, int line = -1) {
-  cudaError_t err = cudaGetLastError();
-  if (cudaSuccess != err) {
-    if (line >= 0) {
-      fprintf(stderr, "Line %d: ", line);
-    }
-    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
-    exit(EXIT_FAILURE);
-  }
+    cudaError_t err = cudaGetLastError();
+    if (cudaSuccess != err) {
+        if (line >= 0) {
+            fprintf(stderr, "Line %d: ", line);
+        }
+        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
 }
@@ -49,7 +49,7 @@ void checkCUDAError(const char *msg, int line = -1) {
 #define rule2Scale 0.1f
 #define rule3Scale 0.1f

-#define maxSpeed 1.0f
+#define maxSpeed 1.f

 /*! Size of the starting area in simulation space. */
 #define scene_scale 100.0f
@@ -86,6 +86,9 @@ int *dev_gridCellEndIndices;   // to this cell?
 // TODO-2.3 - consider what additional buffers you might need to reshuffle
 // the position and velocity data to be coherent within cells.

+// Buffer to ping-pong positions
+glm::vec3 *dev_pos2;
+
 // LOOK-2.1 - Grid parameters based on simulation parameters.
 // These are automatically computed for you in Boids::initSimulation
 int gridCellCount;
@@ -99,13 +102,13 @@ glm::vec3 gridMinimum;
 ******************/

 __host__ __device__ unsigned int hash(unsigned int a) {
-  a = (a + 0x7ed55d16) + (a << 12);
-  a = (a ^ 0xc761c23c) ^ (a >> 19);
-  a = (a + 0x165667b1) + (a << 5);
-  a = (a + 0xd3a2646c) ^ (a << 9);
-  a = (a + 0xfd7046c5) + (a << 3);
-  a = (a ^ 0xb55a4f09) ^ (a >> 16);
-  return a;
+    a = (a + 0x7ed55d16) + (a << 12);
+    a = (a ^ 0xc761c23c) ^ (a >> 19);
+    a = (a + 0x165667b1) + (a << 5);
+    a = (a + 0xd3a2646c) ^ (a << 9);
+    a = (a + 0xfd7046c5) + (a << 3);
+    a = (a ^ 0xb55a4f09) ^ (a >> 16);
+    return a;
 }

 /**
@@ -113,10 +116,10 @@ __host__ __device__ unsigned int hash(unsigned int a) {
  * Function for generating a random vec3.
  */
 __host__ __device__ glm::vec3 generateRandomVec3(float time, int index) {
-  thrust::default_random_engine rng(hash((int)(index * time)));
-  thrust::uniform_real_distribution<float> unitDistrib(-1, 1);
+    thrust::default_random_engine rng(hash((int)(index * time)));
+    thrust::uniform_real_distribution<float> unitDistrib(-1, 1);

-  return glm::vec3((float)unitDistrib(rng), (float)unitDistrib(rng), (float)unitDistrib(rng));
+    return glm::vec3((float)unitDistrib(rng), (float)unitDistrib(rng), (float)unitDistrib(rng));
 }

 /**
@@ -124,52 +127,67 @@ __host__ __device__ glm::vec3 generateRandomVec3(float time, int index) {
  * CUDA kernel for generating boids with a specified mass randomly around the star.
  */
 __global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3 * arr, float scale) {
-  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
-  if (index < N) {
-    glm::vec3 rand = generateRandomVec3(time, index);
-    arr[index].x = scale * rand.x;
-    arr[index].y = scale * rand.y;
-    arr[index].z = scale * rand.z;
-  }
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index < N) {
+        glm::vec3 rand = generateRandomVec3(time, index);
+        arr[index].x = scale * rand.x;
+        arr[index].y = scale * rand.y;
+        arr[index].z = scale * rand.z;
+    }
 }

 /**
  * Initialize memory, update some globals
  */
 void Boids::initSimulation(int N) {
-  numObjects = N;
-  dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
-
-  // LOOK-1.2 - This is basic CUDA memory management and error checking.
-  // Don't forget to cudaFree in Boids::endSimulation.
-  cudaMalloc((void**)&dev_pos, N * sizeof(glm::vec3));
-  checkCUDAErrorWithLine("cudaMalloc dev_pos failed!");
-
-  cudaMalloc((void**)&dev_vel1, N * sizeof(glm::vec3));
-  checkCUDAErrorWithLine("cudaMalloc dev_vel1 failed!");
-
-  cudaMalloc((void**)&dev_vel2, N * sizeof(glm::vec3));
-  checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!");
-
-  // LOOK-1.2 - This is a typical CUDA kernel invocation.
-  kernGenerateRandomPosArray<<<fullBlocksPerGrid, blockSize>>>(1, numObjects,
-    dev_pos, scene_scale);
-  checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!");
-
-  // LOOK-2.1 computing grid params
-  gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
-  int halfSideCount = (int)(scene_scale / gridCellWidth) + 1;
-  gridSideCount = 2 * halfSideCount;
-
-  gridCellCount = gridSideCount * gridSideCount * gridSideCount;
-  gridInverseCellWidth = 1.0f / gridCellWidth;
-  float halfGridWidth = gridCellWidth * halfSideCount;
-  gridMinimum.x -= halfGridWidth;
-  gridMinimum.y -= halfGridWidth;
-  gridMinimum.z -= halfGridWidth;
-
-  // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
-  cudaThreadSynchronize();
+    numObjects = N;
+    dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
+
+    // LOOK-1.2 - This is basic CUDA memory management and error checking.
+    // Don't forget to cudaFree in Boids::endSimulation.
+    cudaMalloc((void**)&dev_pos, N * sizeof(glm::vec3));
+    checkCUDAErrorWithLine("cudaMalloc dev_pos failed!");
+
+    cudaMalloc((void**)&dev_vel1, N * sizeof(glm::vec3));
+    checkCUDAErrorWithLine("cudaMalloc dev_vel1 failed!");
+
+    cudaMalloc((void**)&dev_vel2, N * sizeof(glm::vec3));
+    checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!");
+
+    // LOOK-1.2 - This is a typical CUDA kernel invocation.
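+    // Note on the launch configuration (an added explanation, derived from
+    // the fullBlocksPerGrid computation above rather than from the project
+    // docs): <<<fullBlocksPerGrid, blockSize>>> launches ceil(N / blockSize)
+    // blocks of blockSize threads each, computed as (N + blockSize - 1) /
+    // blockSize. Assuming the default blockSize of 128 defined near the top
+    // of kernel.cu (not shown in this diff), N = 5000 boids gives
+    // (5000 + 127) / 128 = 40 blocks, i.e. 5120 threads; the index < N
+    // bounds check in each kernel idles the 120 extra threads.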
+    kernGenerateRandomPosArray<<<fullBlocksPerGrid, blockSize>>>(1, numObjects,
+        dev_pos, scene_scale);
+    checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!");
+
+    // LOOK-2.1 computing grid params
+    gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
+    int halfSideCount = (int)(scene_scale / gridCellWidth) + 1;
+    gridSideCount = 2 * halfSideCount;
+
+    gridCellCount = gridSideCount * gridSideCount * gridSideCount;
+    gridInverseCellWidth = 1.0f / gridCellWidth;
+    float halfGridWidth = gridCellWidth * halfSideCount;
+    gridMinimum.x -= halfGridWidth;
+    gridMinimum.y -= halfGridWidth;
+    gridMinimum.z -= halfGridWidth;
+
+    // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+    cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");
+
+    cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");
+
+    cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");
+
+    cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!");
+
+    cudaMalloc((void**)&dev_pos2, N * sizeof(glm::vec3));
+    checkCUDAErrorWithLine("cudaMalloc dev_pos2 failed!");
+
+    cudaThreadSynchronize();
 }
@@ -181,41 +199,41 @@ void Boids::initSimulation(int N) {
  * Copy the boid positions into the VBO so that they can be drawn by OpenGL.
  */
 __global__ void kernCopyPositionsToVBO(int N, glm::vec3 *pos, float *vbo, float s_scale) {
-  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);

-  float c_scale = -1.0f / s_scale;
+    float c_scale = -1.0f / s_scale;

-  if (index < N) {
-    vbo[4 * index + 0] = pos[index].x * c_scale;
-    vbo[4 * index + 1] = pos[index].y * c_scale;
-    vbo[4 * index + 2] = pos[index].z * c_scale;
-    vbo[4 * index + 3] = 1.0f;
-  }
+    if (index < N) {
+        vbo[4 * index + 0] = pos[index].x * c_scale;
+        vbo[4 * index + 1] = pos[index].y * c_scale;
+        vbo[4 * index + 2] = pos[index].z * c_scale;
+        vbo[4 * index + 3] = 1.0f;
+    }
 }

 __global__ void kernCopyVelocitiesToVBO(int N, glm::vec3 *vel, float *vbo, float s_scale) {
-  int index = threadIdx.x + (blockIdx.x * blockDim.x);
-
-  if (index < N) {
-    vbo[4 * index + 0] = vel[index].x + 0.3f;
-    vbo[4 * index + 1] = vel[index].y + 0.3f;
-    vbo[4 * index + 2] = vel[index].z + 0.3f;
-    vbo[4 * index + 3] = 1.0f;
-  }
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+    if (index < N) {
+        vbo[4 * index + 0] = vel[index].x + 0.3f;
+        vbo[4 * index + 1] = vel[index].y + 0.3f;
+        vbo[4 * index + 2] = vel[index].z + 0.3f;
+        vbo[4 * index + 3] = 1.0f;
+    }
 }

 /**
  * Wrapper for call to the kernCopyboidsToVBO CUDA kernel.
  */
 void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) {
-  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+    dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);

-  kernCopyPositionsToVBO<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, vbodptr_positions, scene_scale);
-  kernCopyVelocitiesToVBO<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_vel1, vbodptr_velocities, scene_scale);
+    kernCopyPositionsToVBO<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, vbodptr_positions, scene_scale);
+    kernCopyVelocitiesToVBO<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_vel1, vbodptr_velocities, scene_scale);

-  checkCUDAErrorWithLine("copyBoidsToVBO failed!");
+    checkCUDAErrorWithLine("copyBoidsToVBO failed!");

-  cudaThreadSynchronize();
+    cudaThreadSynchronize();
 }
@@ -230,10 +248,46 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities)
 * in the `pos` and `vel` arrays.
 */
__device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) {
-  // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
-  // Rule 2: boids try to stay a distance d away from each other
-  // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+    glm::vec3 thisPos = pos[iSelf];
+    glm::vec3 thisVel = vel[iSelf];
+
+    glm::vec3 centroid(0.0f, 0.0f, 0.0f);
+    int neighborCount = 0;
+
+    glm::vec3 separate(0.0f, 0.0f, 0.0f);
+    // Accumulates neighbor velocities for rule 3 (velocity matching); the
+    // rule 1 centering term is handled via centroid above.
+    glm::vec3 cohesion(0.0f, 0.0f, 0.0f);
+
+    for (int x = 0; x < N; x++){
+        if (x == iSelf) continue;
+        float distance = glm::distance(thisPos, pos[x]);
+
+        // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+        if (distance < rule1Distance){
+            centroid += pos[x];
+            neighborCount += 1;
+        }
+
+        // Rule 2: boids try to stay a distance d away from each other
+        if (distance < rule2Distance){
+            separate -= pos[x] - thisPos;
+        }
+
+        // Rule 3: boids try to match the velocity of surrounding boids
+        if (distance < rule3Distance){
+            cohesion += vel[x];
+        }
+    }
+
+    if (neighborCount > 0){
+        centroid /= neighborCount;
+        thisVel += (centroid - thisPos) * rule1Scale;
+        thisVel += cohesion * rule3Scale;
+    }
+
+    thisVel += separate * rule2Scale;
+
+    return thisVel;
 }

 /**
@@ -241,10 +295,19 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po
 * For each of the `N` bodies, update its position based on its current velocity.
 */
__global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
-  glm::vec3 *vel1, glm::vec3 *vel2) {
-  // Compute a new velocity based on pos and vel1
-  // Clamp the speed
-  // Record the new velocity into vel2. Question: why NOT vel1?
+    glm::vec3 *vel1, glm::vec3 *vel2) {
+    // Compute a new velocity based on pos and vel1
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (index >= N) {
+        return;
+    }
+    glm::vec3 newVel = computeVelocityChange(N, index, pos, vel1);
+
+    // Clamp the speed
+    if (glm::length(newVel) > maxSpeed) newVel = newVel * maxSpeed / glm::length(newVel);
+
+    // Record the new velocity into vel2. Question: why NOT vel1?
+    // (Because other threads may still be reading this boid's old velocity
+    // from vel1 for their own updates; writing vel1 in place would race.)
+    vel2[index] = newVel;
 }

 /**
@@ -252,24 +315,24 @@ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
 * For each of the `N` bodies, update its position based on its current velocity.
 */
__global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) {
-  // Update position by velocity
-  int index = threadIdx.x + (blockIdx.x * blockDim.x);
-  if (index >= N) {
-    return;
-  }
-  glm::vec3 thisPos = pos[index];
-  thisPos += vel[index] * dt;
-
-  // Wrap the boids around so we don't lose them
-  thisPos.x = thisPos.x < -scene_scale ? scene_scale : thisPos.x;
-  thisPos.y = thisPos.y < -scene_scale ? scene_scale : thisPos.y;
-  thisPos.z = thisPos.z < -scene_scale ? scene_scale : thisPos.z;
-
-  thisPos.x = thisPos.x > scene_scale ? -scene_scale : thisPos.x;
-  thisPos.y = thisPos.y > scene_scale ? -scene_scale : thisPos.y;
-  thisPos.z = thisPos.z > scene_scale ? -scene_scale : thisPos.z;
-
-  pos[index] = thisPos;
+    // Update position by velocity
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (index >= N) {
+        return;
+    }
+    glm::vec3 thisPos = pos[index];
+    thisPos += vel[index] * dt;
+
+    // Wrap the boids around so we don't lose them
+    thisPos.x = thisPos.x < -scene_scale ? scene_scale : thisPos.x;
+    thisPos.y = thisPos.y < -scene_scale ? scene_scale : thisPos.y;
+    thisPos.z = thisPos.z < -scene_scale ? scene_scale : thisPos.z;
+
+    thisPos.x = thisPos.x > scene_scale ? -scene_scale : thisPos.x;
+    thisPos.y = thisPos.y > scene_scale ? -scene_scale : thisPos.y;
+    thisPos.z = thisPos.z > scene_scale ? -scene_scale : thisPos.z;
+
+    pos[index] = thisPos;
 }

 // LOOK-2.1 Consider this method of computing a 1D index from a 3D grid index.
@@ -279,181 +342,513 @@ __global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) {
 // for(y)
 // for(z)? Or some other order?
 __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) {
-  return x + y * gridResolution + z * gridResolution * gridResolution;
+    return x + y * gridResolution + z * gridResolution * gridResolution;
 }

 __global__ void kernComputeIndices(int N, int gridResolution,
-  glm::vec3 gridMin, float inverseCellWidth,
-  glm::vec3 *pos, int *indices, int *gridIndices) {
-  // TODO-2.1
-  // - Label each boid with the index of its grid cell.
-  // - Set up a parallel array of integer indices as pointers to the actual
-  //   boid data in pos and vel1/vel2
+    glm::vec3 gridMin, float inverseCellWidth,
+    glm::vec3 *pos, int *indices, int *gridIndices) {
+    // TODO-2.1
+    // - Label each boid with the index of its grid cell.
+    // - Set up a parallel array of integer indices as pointers to the actual
+    //   boid data in pos and vel1/vel2
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= N) {
+        return;
+    }
+
+    // Pointer to the actual boid data
+    indices[index] = index;
+
+    // Compute which grid cell this boid occupies
+    glm::vec3 thisPosInCells = (pos[index] - gridMin) * inverseCellWidth;
+    gridIndices[index] = gridIndex3Dto1D((int)thisPosInCells[0], (int)thisPosInCells[1], (int)thisPosInCells[2], gridResolution);
 }

 // LOOK-2.1 Consider how this could be useful for indicating that a cell
 // does not enclose any boids
 __global__ void kernResetIntBuffer(int N, int *intBuffer, int value) {
-  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
-  if (index < N) {
-    intBuffer[index] = value;
-  }
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index < N) {
+        intBuffer[index] = value;
+    }
+}
+
+// 2.3 - Rearrange the pos and vel arrays so that boids in the same grid
+// cell sit next to each other in memory
+__global__ void kernMakePosAndVelCoherent(int N, int *particleArrayIndices,
+    glm::vec3 *pos1, glm::vec3 *pos2, glm::vec3 *vel1, glm::vec3 *vel2){
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= N) {
+        return;
+    }
+
+    int oldIndex = particleArrayIndices[index];
+    pos2[index] = pos1[oldIndex];
+    vel2[index] = vel1[oldIndex];
 }

 __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
-  int *gridCellStartIndices, int *gridCellEndIndices) {
-  // TODO-2.1
-  // Identify the start point of each cell in the gridIndices array.
-  // This is basically a parallel unrolling of a loop that goes
-  // "this index doesn't match the one before it, must be a new cell!"
+    int *gridCellStartIndices, int *gridCellEndIndices) {
+    // TODO-2.1
+    // Identify the start point of each cell in the gridIndices array.
+    // This is basically a parallel unrolling of a loop that goes
+    // "this index doesn't match the one before it, must be a new cell!"
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= N) {
+        return;
+    }
+
+    // Get the grid index for this thread
+    int thisGridIndex = particleGridIndices[index];
+
+    // The first boid in the sorted array starts its cell; the last one ends
+    // its cell (end indices are exclusive).
+    if (index == 0) {
+        gridCellStartIndices[thisGridIndex] = 0;
+    }
+    if (index == N - 1) {
+        gridCellEndIndices[thisGridIndex] = N;
+        return; // no index + 1 to compare against
+    }
+
+    // Wherever the sorted grid index changes, one cell ends and the next
+    // occupied cell begins.
+    int theNextGridIndex = particleGridIndices[index + 1];
+    if (theNextGridIndex != thisGridIndex){
+        gridCellEndIndices[thisGridIndex] = index + 1;
+        gridCellStartIndices[theNextGridIndex] = index + 1;
+    }
 }

 __global__ void kernUpdateVelNeighborSearchScattered(
-  int N, int gridResolution, glm::vec3 gridMin,
-  float inverseCellWidth, float cellWidth,
-  int *gridCellStartIndices, int *gridCellEndIndices,
-  int *particleArrayIndices,
-  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
-  // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
-  // the number of boids that need to be checked.
-  // - Identify the grid cell that this particle is in
-  // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
-  // - Clamp the speed change before putting the new speed in vel2
+    int N, int gridResolution, glm::vec3 gridMin,
+    float inverseCellWidth, float cellWidth,
+    int *gridCellStartIndices, int *gridCellEndIndices,
+    int *particleArrayIndices,
+    glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
+    // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
+    // the number of boids that need to be checked.
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= N) {
+        return;
+    }
+
+    // - Identify the grid cell that this particle is in
+    glm::vec3 thisPos = pos[index];
+    glm::vec3 thisPosInCells = (thisPos - gridMin) * inverseCellWidth;
+    int cellX = (int)thisPosInCells[0];
+    int cellY = (int)thisPosInCells[1];
+    int cellZ = (int)thisPosInCells[2];
+
+    // - Identify which cells may contain neighbors. This isn't always 8.
+    // Determine which half of its cell this boid sits in along each axis:
+    // because the cell width is twice the largest rule distance, the
+    // neighborhood sphere can only spill into the adjacent cell on that
+    // side. -1 means check the minus neighbor, +1 the plus neighbor.
+    glm::vec3 shiftedPos = thisPos - (gridMin + cellWidth * glm::vec3(cellX, cellY, cellZ));
+    int deltaX = -1;
+    int deltaY = -1;
+    int deltaZ = -1;
+    if (shiftedPos[0] > cellWidth / 2) deltaX = 1;
+    if (shiftedPos[1] > cellWidth / 2) deltaY = 1;
+    if (shiftedPos[2] > cellWidth / 2) deltaZ = 1;
+
+    // - For each cell, read the start/end indices in the boid pointer array.
+    // - Access each boid in the cell and compute velocity change from
+    //   the boids rules, if this boid is within the neighborhood distance.
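+    // Note (added explanation): only a 2x2x2 octant of cells needs to be
+    // searched here because gridCellWidth is set to twice the largest rule
+    // distance in initSimulation. Had the cells been only one rule distance
+    // wide, the search would cover the full 3x3x3 neighborhood (27 cells)
+    // instead of 8.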
+    glm::vec3 thisVel = vel1[index];
+
+    int neighborCount = 0;
+    glm::vec3 centroid(0.0f, 0.0f, 0.0f);
+    glm::vec3 separate(0.0f, 0.0f, 0.0f);
+    glm::vec3 cohesion(0.0f, 0.0f, 0.0f);
+
+    for (int x = 0; x < 2; x++){
+        for (int y = 0; y < 2; y++){
+            for (int z = 0; z < 2; z++){
+                // Skip neighbor cells that fall outside the grid.
+                int neighborX = cellX + x * deltaX;
+                int neighborY = cellY + y * deltaY;
+                int neighborZ = cellZ + z * deltaZ;
+                if (neighborX < 0 || neighborX >= gridResolution ||
+                    neighborY < 0 || neighborY >= gridResolution ||
+                    neighborZ < 0 || neighborZ >= gridResolution) continue;
+                int gridCellToCheck = gridIndex3Dto1D(neighborX, neighborY, neighborZ, gridResolution);
+                // Start indices were reset to -1 beforehand; still -1 means the cell is empty.
+                if (gridCellStartIndices[gridCellToCheck] == -1) continue;
+
+                for (int i = gridCellStartIndices[gridCellToCheck]; i < gridCellEndIndices[gridCellToCheck]; i++){
+                    int otherBoid = particleArrayIndices[i];
+                    if (otherBoid == index) continue;
+                    float distance = glm::distance(thisPos, pos[otherBoid]);
+
+                    // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+                    if (distance < rule1Distance){
+                        centroid += pos[otherBoid];
+                        neighborCount += 1;
+                    }
+
+                    // Rule 2: boids try to stay a distance d away from each other
+                    if (distance < rule2Distance){
+                        separate -= pos[otherBoid] - thisPos;
+                    }
+
+                    // Rule 3: boids try to match the velocity of surrounding boids
+                    if (distance < rule3Distance){
+                        cohesion += vel1[otherBoid];
+                    }
+                }
+            }
+        }
+    }
+    if (neighborCount > 0){
+        centroid /= neighborCount;
+        thisVel += (centroid - thisPos) * rule1Scale;
+        thisVel += cohesion * rule3Scale;
+    }
+
+    thisVel += separate * rule2Scale;
+
+    // - Clamp the speed change before putting the new speed in vel2
+    if (glm::length(thisVel) > maxSpeed) thisVel = thisVel * maxSpeed / glm::length(thisVel);
+    vel2[index] = thisVel;
 }

 __global__ void kernUpdateVelNeighborSearchCoherent(
-  int N, int gridResolution, glm::vec3 gridMin,
-  float inverseCellWidth, float cellWidth,
-  int *gridCellStartIndices, int *gridCellEndIndices,
-  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
-  // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered,
-  // except with one less level of indirection.
-  // This should expect gridCellStartIndices and gridCellEndIndices to refer
-  // directly to pos and vel1.
-  // - Identify the grid cell that this particle is in
-  // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  //   DIFFERENCE: For best results, consider what order the cells should be
-  //   checked in to maximize the memory benefits of reordering the boids data.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
-  // - Clamp the speed change before putting the new speed in vel2
+    int N, int gridResolution, glm::vec3 gridMin,
+    float inverseCellWidth, float cellWidth,
+    int *gridCellStartIndices, int *gridCellEndIndices,
+    glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
+    // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered,
+    // except with one less level of indirection.
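+    // Note (added explanation): because kernMakePosAndVelCoherent has
+    // already reordered pos and vel1 into cell order, each cell's boids
+    // occupy a contiguous run of memory here. The neighbor reads below hit
+    // consecutive addresses instead of chasing particleArrayIndices, which
+    // is the likely source of the large speedup discussed in the README.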
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= N) {
+        return;
+    }
+
+    // - Identify the grid cell that this particle is in
+    glm::vec3 thisPos = pos[index];
+    glm::vec3 thisPosInCells = (thisPos - gridMin) * inverseCellWidth;
+    int cellX = (int)thisPosInCells[0];
+    int cellY = (int)thisPosInCells[1];
+    int cellZ = (int)thisPosInCells[2];
+
+    // - Identify which cells may contain neighbors. This isn't always 8.
+    // As in the scattered version, pick the 2x2x2 octant on this boid's side
+    // of its cell: -1 means check the minus neighbor, +1 the plus neighbor.
+    glm::vec3 shiftedPos = thisPos - (gridMin + cellWidth * glm::vec3(cellX, cellY, cellZ));
+    int deltaX = -1;
+    int deltaY = -1;
+    int deltaZ = -1;
+    if (shiftedPos[0] > cellWidth / 2) deltaX = 1;
+    if (shiftedPos[1] > cellWidth / 2) deltaY = 1;
+    if (shiftedPos[2] > cellWidth / 2) deltaZ = 1;
+
+    // - For each cell, read the start/end indices, which now refer directly
+    //   into pos and vel1.
+    // - Access each boid in the cell and compute velocity change from
+    //   the boids rules, if this boid is within the neighborhood distance.
+    glm::vec3 thisVel = vel1[index];
+
+    int neighborCount = 0;
+    glm::vec3 centroid(0.0f, 0.0f, 0.0f);
+    glm::vec3 separate(0.0f, 0.0f, 0.0f);
+    glm::vec3 cohesion(0.0f, 0.0f, 0.0f);
+
+    for (int x = 0; x < 2; x++){
+        for (int y = 0; y < 2; y++){
+            for (int z = 0; z < 2; z++){
+                // Skip neighbor cells that fall outside the grid.
+                int neighborX = cellX + x * deltaX;
+                int neighborY = cellY + y * deltaY;
+                int neighborZ = cellZ + z * deltaZ;
+                if (neighborX < 0 || neighborX >= gridResolution ||
+                    neighborY < 0 || neighborY >= gridResolution ||
+                    neighborZ < 0 || neighborZ >= gridResolution) continue;
+                int gridCellToCheck = gridIndex3Dto1D(neighborX, neighborY, neighborZ, gridResolution);
+                // Start indices were reset to -1 beforehand; still -1 means the cell is empty.
+                if (gridCellStartIndices[gridCellToCheck] == -1) continue;
+
+                for (int i = gridCellStartIndices[gridCellToCheck]; i < gridCellEndIndices[gridCellToCheck]; i++){
+                    // One less level of indirection: i indexes pos/vel1 directly.
+                    if (i == index) continue;
+                    float distance = glm::distance(thisPos, pos[i]);
+
+                    // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+                    if (distance < rule1Distance){
+                        centroid += pos[i];
+                        neighborCount += 1;
+                    }
+
+                    // Rule 2: boids try to stay a distance d away from each other
+                    if (distance < rule2Distance){
+                        separate -= pos[i] - thisPos;
+                    }
+
+                    // Rule 3: boids try to match the velocity of surrounding boids
+                    if (distance < rule3Distance){
+                        cohesion += vel1[i];
+                    }
+                }
+            }
+        }
+    }
+    if (neighborCount > 0){
+        centroid /= neighborCount;
+        thisVel += (centroid - thisPos) * rule1Scale;
+        thisVel += cohesion * rule3Scale;
+    }
+
+    thisVel += separate * rule2Scale;
+
+    // - Clamp the speed change before putting the new speed in vel2
+    if (glm::length(thisVel) > maxSpeed) thisVel = thisVel * maxSpeed / glm::length(thisVel);
+    vel2[index] = thisVel;
 }

 /**
  * Step the entire N-body simulation by `dt` seconds.
  */
 void Boids::stepSimulationNaive(float dt) {
-  // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
-  // TODO-1.2 ping-pong the velocity buffers
+    // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
+    dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+    // Update velocity
+    kernUpdateVelocityBruteForce<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+    // Update position
+    kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+
+    // TODO-1.2 ping-pong the velocity buffers
+    glm::vec3 *temp_vel = dev_vel1;
+    dev_vel1 = dev_vel2;
+    dev_vel2 = temp_vel;
 }

 void Boids::stepSimulationScatteredGrid(float dt) {
-  // TODO-2.1
-  // Uniform Grid Neighbor search using Thrust sort.
-  // In Parallel:
-  // - label each particle with its array index as well as its grid index.
-  //   Use 2x width grids.
-  // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
-  //   are welcome to do a performance comparison.
-  // - Naively unroll the loop for finding the start and end indices of each
-  //   cell's data pointers in the array of boid indices
-  // - Perform velocity updates using neighbor search
-  // - Update positions
-  // - Ping-pong buffers as needed
+    // TODO-2.1
+    // Uniform Grid Neighbor search using Thrust sort.
+    // In Parallel:
+    // - label each particle with its array index as well as its grid index.
+    //   Use 2x width grids.
+    dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+    kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount,
+        gridMinimum, gridInverseCellWidth,
+        dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+
+    // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
+    //   are welcome to do a performance comparison.
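+    // Note (added explanation): thrust::sort_by_key sorts the keys
+    // (dev_particleGridIndices) in place and applies the same permutation to
+    // the values (dev_particleArrayIndices). Afterwards, boids sharing a
+    // grid cell are adjacent in the key array, and the value array records
+    // where each of those boids actually lives in pos/vel1.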
+    thrust::device_ptr<int> dev_thrust_keys(dev_particleGridIndices);
+    thrust::device_ptr<int> dev_thrust_values(dev_particleArrayIndices);
+    thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + numObjects, dev_thrust_values);
+
+    // Set all values to -1 first for easy empty cell identification
+    dim3 fullBlocksPerGridForCells((gridCellCount + blockSize - 1) / blockSize);
+    kernResetIntBuffer<<<fullBlocksPerGridForCells, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+    kernResetIntBuffer<<<fullBlocksPerGridForCells, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+
+    // - Naively unroll the loop for finding the start and end indices of each
+    //   cell's data pointers in the array of boid indices
+    kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices,
+        dev_gridCellStartIndices, dev_gridCellEndIndices);
+
+    // - Perform velocity updates using neighbor search
+    kernUpdateVelNeighborSearchScattered<<<fullBlocksPerGrid, blockSize>>>(
+        numObjects, gridSideCount, gridMinimum,
+        gridInverseCellWidth, gridCellWidth,
+        dev_gridCellStartIndices, dev_gridCellEndIndices,
+        dev_particleArrayIndices,
+        dev_pos, dev_vel1, dev_vel2);
+
+    // - Update positions
+    kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+
+    // - Ping-pong buffers as needed
+    glm::vec3 *temp_vel = dev_vel1;
+    dev_vel1 = dev_vel2;
+    dev_vel2 = temp_vel;
 }

 void Boids::stepSimulationCoherentGrid(float dt) {
-  // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid
-  // Uniform Grid Neighbor search using Thrust sort on cell-coherent data.
-  // In Parallel:
-  // - Label each particle with its array index as well as its grid index.
-  //   Use 2x width grids
-  // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
-  //   are welcome to do a performance comparison.
-  // - Naively unroll the loop for finding the start and end indices of each
-  //   cell's data pointers in the array of boid indices
-  // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all
-  //   the particle data in the simulation array.
-  //   CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED
-  // - Perform velocity updates using neighbor search
-  // - Update positions
-  // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+    // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid
+    // Uniform Grid Neighbor search using Thrust sort on cell-coherent data.
+    // In Parallel:
+    // - Label each particle with its array index as well as its grid index.
+    //   Use 2x width grids
+
+    dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+    kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount,
+        gridMinimum, gridInverseCellWidth,
+        dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+
+    // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
+    //   are welcome to do a performance comparison.
+    thrust::device_ptr<int> dev_thrust_keys(dev_particleGridIndices);
+    thrust::device_ptr<int> dev_thrust_values(dev_particleArrayIndices);
+    thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + numObjects, dev_thrust_values);
+
+    // Set all values to -1 first for easy empty cell identification
+    dim3 fullBlocksPerGridForCells((gridCellCount + blockSize - 1) / blockSize);
+    kernResetIntBuffer<<<fullBlocksPerGridForCells, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+    kernResetIntBuffer<<<fullBlocksPerGridForCells, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+
+    // - Naively unroll the loop for finding the start and end indices of each
+    //   cell's data pointers in the array of boid indices
+    kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices,
+        dev_gridCellStartIndices, dev_gridCellEndIndices);
+
+    // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all
+    //   the particle data in the simulation array.
+    //   CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED
+    kernMakePosAndVelCoherent<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleArrayIndices, dev_pos, dev_pos2, dev_vel1, dev_vel2);
+
+    glm::vec3 *temp = dev_pos;
+    dev_pos = dev_pos2;
+    dev_pos2 = temp;
+
+    temp = dev_vel1;
+    dev_vel1 = dev_vel2;
+    dev_vel2 = temp;
+
+    // - Perform velocity updates using neighbor search
+    kernUpdateVelNeighborSearchCoherent<<<fullBlocksPerGrid, blockSize>>>(
+        numObjects, gridSideCount, gridMinimum,
+        gridInverseCellWidth, gridCellWidth,
+        dev_gridCellStartIndices, dev_gridCellEndIndices,
+        dev_pos, dev_vel1, dev_vel2);
+
+    // - Update positions
+    kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+
+    // - Ping-pong buffers as needed
+    temp = dev_vel1;
+    dev_vel1 = dev_vel2;
+    dev_vel2 = temp;
 }

 void Boids::endSimulation() {
-  cudaFree(dev_vel1);
-  cudaFree(dev_vel2);
-  cudaFree(dev_pos);
-
-  // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+    cudaFree(dev_vel1);
+    cudaFree(dev_vel2);
+    cudaFree(dev_pos);
+
+    // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+    cudaFree(dev_gridCellEndIndices);
+    cudaFree(dev_gridCellStartIndices);
+    cudaFree(dev_particleArrayIndices);
+    cudaFree(dev_particleGridIndices);
+    cudaFree(dev_pos2);
 }

 void Boids::unitTest() {
-  // LOOK-1.2 Feel free to write additional tests here.
-
-  // test unstable sort
-  int *dev_intKeys;
-  int *dev_intValues;
-  int N = 10;
-
-  int *intKeys = new int[N];
-  int *intValues = new int[N];
-
-  intKeys[0] = 0; intValues[0] = 0;
-  intKeys[1] = 1; intValues[1] = 1;
-  intKeys[2] = 0; intValues[2] = 2;
-  intKeys[3] = 3; intValues[3] = 3;
-  intKeys[4] = 0; intValues[4] = 4;
-  intKeys[5] = 2; intValues[5] = 5;
-  intKeys[6] = 2; intValues[6] = 6;
-  intKeys[7] = 0; intValues[7] = 7;
-  intKeys[8] = 5; intValues[8] = 8;
-  intKeys[9] = 6; intValues[9] = 9;
-
-  cudaMalloc((void**)&dev_intKeys, N * sizeof(int));
-  checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!");
-
-  cudaMalloc((void**)&dev_intValues, N * sizeof(int));
-  checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!");
-
-  dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
-
-  std::cout << "before unstable sort: " << std::endl;
-  for (int i = 0; i < N; i++) {
-    std::cout << "  key: " << intKeys[i];
-    std::cout << " value: " << intValues[i] << std::endl;
-  }
-
-  // How to copy data to the GPU
-  cudaMemcpy(dev_intKeys, intKeys, sizeof(int) * N, cudaMemcpyHostToDevice);
-  cudaMemcpy(dev_intValues, intValues, sizeof(int) * N, cudaMemcpyHostToDevice);
-
-  // Wrap device vectors in thrust iterators for use with thrust.
-  thrust::device_ptr<int> dev_thrust_keys(dev_intKeys);
-  thrust::device_ptr<int> dev_thrust_values(dev_intValues);
-  // LOOK-2.1 Example for using thrust::sort_by_key
-  thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values);
-
-  // How to copy data back to the CPU side from the GPU
-  cudaMemcpy(intKeys, dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost);
-  cudaMemcpy(intValues, dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost);
-  checkCUDAErrorWithLine("memcpy back failed!");
-
-  std::cout << "after unstable sort: " << std::endl;
-  for (int i = 0; i < N; i++) {
-    std::cout << "  key: " << intKeys[i];
-    std::cout << " value: " << intValues[i] << std::endl;
-  }
-
-  // cleanup
-  delete(intKeys);
-  delete(intValues);
-  cudaFree(dev_intKeys);
-  cudaFree(dev_intValues);
-  checkCUDAErrorWithLine("cudaFree failed!");
-  return;
+    // LOOK-1.2 Feel free to write additional tests here.
+
+    // test unstable sort
+    int *dev_intKeys;
+    int *dev_intValues;
+    int N = 10;
+
+    int *intKeys = new int[N];
+    int *intValues = new int[N];
+
+    intKeys[0] = 0; intValues[0] = 0;
+    intKeys[1] = 1; intValues[1] = 1;
+    intKeys[2] = 0; intValues[2] = 2;
+    intKeys[3] = 3; intValues[3] = 3;
+    intKeys[4] = 0; intValues[4] = 4;
+    intKeys[5] = 2; intValues[5] = 5;
+    intKeys[6] = 2; intValues[6] = 6;
+    intKeys[7] = 0; intValues[7] = 7;
+    intKeys[8] = 5; intValues[8] = 8;
+    intKeys[9] = 6; intValues[9] = 9;
+
+    cudaMalloc((void**)&dev_intKeys, N * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!");
+
+    cudaMalloc((void**)&dev_intValues, N * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!");
+
+    dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
+
+    std::cout << "before unstable sort: " << std::endl;
+    for (int i = 0; i < N; i++) {
+        std::cout << "  key: " << intKeys[i];
+        std::cout << " value: " << intValues[i] << std::endl;
+    }
+
+    // How to copy data to the GPU
+    cudaMemcpy(dev_intKeys, intKeys, sizeof(int) * N, cudaMemcpyHostToDevice);
+    cudaMemcpy(dev_intValues, intValues, sizeof(int) * N, cudaMemcpyHostToDevice);
+
+    // Wrap device vectors in thrust iterators for use with thrust.
+    thrust::device_ptr<int> dev_thrust_keys(dev_intKeys);
+    thrust::device_ptr<int> dev_thrust_values(dev_intValues);
+    // LOOK-2.1 Example for using thrust::sort_by_key
+    thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values);
+
+    // How to copy data back to the CPU side from the GPU
+    cudaMemcpy(intKeys, dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost);
+    cudaMemcpy(intValues, dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost);
+    checkCUDAErrorWithLine("memcpy back failed!");
+
+    std::cout << "after unstable sort: " << std::endl;
+    for (int i = 0; i < N; i++) {
+        std::cout << "  key: " << intKeys[i];
+        std::cout << " value: " << intValues[i] << std::endl;
+    }
+
+    // cleanup
+    delete[] intKeys;
+    delete[] intValues;
+    cudaFree(dev_intKeys);
+    cudaFree(dev_intValues);
+    checkCUDAErrorWithLine("cudaFree failed!");
+    return;
 }
diff --git a/src/main.cpp b/src/main.cpp
index e416836..b7cfec1 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -14,11 +14,11 @@
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
 #define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1

 // LOOK-1.2 - change this to adjust particle count in the simulation
-const int N_FOR_VIS = 5000;
+const int N_FOR_VIS = 2000; //5000
 const float DT = 0.2f;

 /**
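With both toggles set to 1, the coherent-grid path is selected. The step dispatch in main.cpp is not shown in this diff, so the following is only a sketch of how such flags are typically consumed; the names `runCUDA` and the exact preprocessor structure are assumptions, not code from this repo:

```cpp
// Hypothetical sketch of the per-frame dispatch the toggles feed into.
void runCUDA() {
#if UNIFORM_GRID && COHERENT_GRID
    Boids::stepSimulationCoherentGrid(DT);   // both flags on -> coherent grid
#elif UNIFORM_GRID
    Boids::stepSimulationScatteredGrid(DT);  // scattered uniform grid
#else
    Boids::stepSimulationNaive(DT);          // brute force
#endif
}
```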