# Requires the fda package for the basis, smoothing and registration functions
library(fda)

# Read the word timing files for the given participants and merge them into a
# single data frame. Missing files are skipped.
get_timings <- function(participant_ids) {
  first <- TRUE
  timings <- NULL
  for (sid in participant_ids) {
    filename <- paste('Timings/', sid, '.csv', sep='')
    if (!file.exists(filename)) {
      next
    }
    sid_timings <- read.csv2(filename, header=TRUE)
    if (first) {
      timings <- sid_timings
      first <- FALSE
    } else {
      timings <- merge(timings, sid_timings, all=TRUE)
    }
  }
  return(timings)
}

# Read the amplitude and pitch waveforms per participant.
# Structure of the returned lists: [[participant]][[sentence]][[word]]
read_waveforms <- function(words) {
  amplitudes <- list()
  pitches <- list()
  participants <- list()
  all_participants <- list()
  all_sentences <- list()
  all_words <- list()
  i <- 1
  for (p in list.files('Amplitude')) {
    amp_path <- paste('Amplitude/', p, '/', sep='')
    pitch_path <- paste('Pitch/', p, '/', sep='')
    amplitudes[[i]] <- list()
    pitches[[i]] <- list()
    names(amplitudes)[i] <- p
    names(pitches)[i] <- p
    participants[i] <- 0
    # Sentence index associated with the previous word
    previous_word_sentence <- 0
    for (wf in list.files(amp_path)) {
      # Filenames follow the pattern <sentence>_<word number>_<word>.csv
      splitted <- strsplit(wf, '_')[[1]]
      current_sentence <- as.numeric(splitted[1]) + 1
      word_number <- as.numeric(splitted[2])
      word_name <- strsplit(splitted[3], '\\.')[[1]][1]
      if (word_name %in% words) {
        participants[i] <- participants[[i]] + 1
        all_participants <- append(all_participants, p)
        all_sentences <- append(all_sentences, current_sentence)
        all_words <- append(all_words, word_name)
      }
      if (current_sentence > previous_word_sentence) {
        amplitudes[[i]][[current_sentence]] <- list()
        pitches[[i]][[current_sentence]] <- list()
        previous_word_sentence <- current_sentence
      }
      # Sometimes a pitch contour cannot be extracted
      pitch_filename <- paste(pitch_path, wf, sep='')
      if (file.exists(pitch_filename)) {
        amplitude_waveform <- read.csv2(paste(amp_path, wf, sep=''), header=FALSE)
        pitch_waveform <- read.csv2(pitch_filename, header=FALSE)
      } else {
        amplitude_waveform <- NA
        pitch_waveform <- NA
      }
      pitches[[i]][[current_sentence]][[word_number + 1]] <- pitch_waveform
      amplitudes[[i]][[current_sentence]][[word_number + 1]] <- amplitude_waveform
      names(amplitudes[[i]][[current_sentence]])[word_number + 1] <- word_name
      names(pitches[[i]][[current_sentence]])[word_number + 1] <- word_name
    }
    names(participants)[i] <- p
    i <- i + 1
  }
  all_words_metadata <- data.frame(
    participant = unlist(all_participants),
    sentence_pos = unlist(all_sentences),
    word_name = unlist(all_words))
  list('amplitudes' = amplitudes,
       'pitches' = pitches,
       'words_found' = participants,
       'all_words' = all_words_metadata)
}
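# Example (sketch): assuming the working directory contains the Timings/,
# Amplitude/ and Pitch/ folders described above, a typical entry point would
# look like this. The participant ids and target words are placeholders, not
# values from the original analysis.
if (FALSE) {
  timings <- get_timings(c('p01', 'p02'))
  target_words <- c('example')
  waveforms <- read_waveforms(target_words)
  str(waveforms$all_words)
}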
# Used for test purposes and analysis
convert_waveform2bspline <- function(time_list, y) {
  # Similar to Gubian 2010 velocity.R;
  # nbasis, bspline_order and lambda_i are assumed to be set globally
  basis_i <- create.bspline.basis(range(time_list), nbasis, bspline_order)
  Lfdobj <- bspline_order - 2
  # Roughness penalty
  fdPar_i <- fdPar(basis_i, Lfdobj, lambda=lambda_i)
  return(smooth.basis(time_list, y, fdPar_i))
}

# Convert the raw waveforms to B-splines
convert2bsplines <- function(waveforms, convert_function = NULL,
                             normalize_t = FALSE, words = NULL, excluded = NULL) {
  # Also used for determining the per-participant mean and st.dev
  converted <- lapply(1:length(waveforms), function(participant_i) {
    # Keep track of the missing values
    missing_i <- 0
    missing_sentences <- list()
    missing_word_names <- list()
    missing_words <- list()
    missing_reasons <- list()
    participant <- waveforms[participant_i][[1]]
    participant_name <- names(waveforms)[participant_i]
    first_y <- TRUE
    converted_row <- list()
    sentence_i <- 1
    for (sentence in participant) {
      converted_row[[sentence_i]] <- list()
      # This can be skipped due to missing words
      word_i <- 1
      if (length(sentence) > 0) {
        for (actual_word_i in 1:length(sentence)) {
          word_name <- names(sentence)[[actual_word_i]]
          waveform <- sentence[[actual_word_i]]
          include <- TRUE
          if (!is.null(words) && !(word_name %in% words)) {
            include <- FALSE
          }
          if (!is.null(excluded)) {
            matched <- dim(excluded[excluded$participant == participant_name &
                                    excluded$sentence_pos == sentence_i &
                                    excluded$word_name == word_name,])[1]
            if (matched == 1) {
              include <- FALSE
            }
          }
          if (!include) {
            converted_row[[sentence_i]][[word_i]] <- NA
          } else if (!is.null(waveform) && (length(waveform) > 1 || !is.na(waveform))) {
            # Get the word duration from the (global) timings data frame
            duration <- timings[timings['participant'] == participant_name &
                                timings['sentence_pos'] == sentence_i &
                                timings['word_name'] == word_name,][['duration']]
            if (length(duration) == 0) {
              duration_unknown <- TRUE
              duration <- 1
            } else {
              duration_unknown <- FALSE
              duration <- as.numeric(as.character(duration))
            }
            # This might be off a bit, as the start and end might not exactly
            # match the duration
            time_list <- as.numeric(as.character(waveform$V1))
            time_list_limited <- time_list > 0 &
              (duration_unknown | time_list < duration)
            # To work around this, y_left is the average of the samples at the
            # start and y_right the average of those at the end
            t_left <- time_list <= 0
            if (!duration_unknown) {
              t_right <- time_list >= duration
            }
            time_list <- time_list[time_list_limited]
            if (length(time_list) > 1) {
              if (normalize_t) {
                if (duration_unknown) {
                  xlim <- c(0, max(time_list))
                } else {
                  xlim <- c(0, duration)
                }
                # Scale to 0-1
                scale_x <- 1 / max(xlim)
                time_list <- time_list * scale_x
                if (duration_unknown) {
                  # Prevent rounding errors
                  time_list[length(time_list)] <- 1
                }
              }
              y_all <- as.numeric(as.character(waveform$V2))
              y <- y_all[time_list_limited]
              # Get the mean of t <= 0, or the first sample
              y_left <- mean(c(y_all[t_left], y[1]))
              if (is.na(y_left)) {
                y_left <- y[1]
              }
              # Add the averaged start and end times
              if (duration_unknown) {
                time_list <- c(0, time_list)
                y <- c(y_left, y)
              } else {
                # Get the mean of t >= T, or the last sample
                y_right <- mean(c(y_all[t_right], y[length(y)]))
                if (is.na(y_right)) {
                  y_right <- y[length(y)]
                }
                if (normalize_t) {
                  time_list <- c(0, time_list, 1)
                } else {
                  time_list <- c(0, time_list, duration)
                }
                y <- c(y_left, y, y_right)
              }
              if (!is.null(convert_function)) {
                # Normalization or conversion
                y <- convert_function(participant_i, y)
              }
              if (first_y) {
                total_y <- y
                total_xlim <- range(time_list)
                first_y <- FALSE
              } else {
                total_y <- c(total_y, y)
                total_xlim <- range(total_xlim, time_list)
              }
              # Similar to Gubian 2010 velocity.R
              basis_i <- create.bspline.basis(range(time_list), nbasis, bspline_order)
              Lfdobj <- bspline_order - 2
              # Roughness penalty
              fdPar_i <- fdPar(basis_i, Lfdobj, lambda=lambda_i)
              convertedf <- smooth.basis(time_list, y, fdPar_i)
              if (normalize_t) {
                # Assumes z-scores and cuts off at |Z| > 4
                if (any(abs(eval.fd(1:9/10, convertedf$fd)) > 4)) {
                  # Remove outliers
                  convertedf <- NA
                  missing_i <- missing_i + 1
                  missing_sentences[missing_i] <- sentence_i
                  missing_word_names[missing_i] <- word_name
                  missing_words[missing_i] <- actual_word_i
                  missing_reasons[missing_i] <- 'outlier'
                }
              }
              converted_row[[sentence_i]][[word_i]] <- convertedf
            } else {
              converted_row[[sentence_i]][[word_i]] <- NA
              missing_i <- missing_i + 1
              missing_sentences[missing_i] <- sentence_i
              missing_word_names[missing_i] <- word_name
              missing_words[missing_i] <- actual_word_i
              missing_reasons[missing_i] <- 'outside_bounds'
            }
          } else {
            converted_row[[sentence_i]][[word_i]] <- NA
            missing_i <- missing_i + 1
            missing_sentences[missing_i] <- sentence_i
            missing_word_names[missing_i] <- word_name
            missing_words[missing_i] <- actual_word_i
            missing_reasons[missing_i] <- 'no_data'
          }
          word_i <- word_i + 1
        }
      }
      names(converted_row[[sentence_i]]) <- names(sentence)
      sentence_i <- sentence_i + 1
    }
    if (missing_i > 0) {
      missings <- data.frame(
        participant = participant_name,
        sentence_pos = unlist(missing_sentences),
        word_pos = unlist(missing_words),
        word_name = unlist(missing_word_names),
        Exclude = 1,
        missing_reason = unlist(missing_reasons))
    } else {
      missings <- NA
    }
    list('waveforms' = converted_row,
         't_range' = total_xlim,
         'y_range' = range(total_y),
         'mean' = mean(total_y),
         'sd' = sd(total_y),
         'missings' = missings)
  })
  names(converted) <- names(waveforms)
  return(converted)
}
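# Example (sketch): the per-participant mean/sd stored by a first
# convert2bsplines() pass can be fed back through z_normalize() (defined
# below) for a second, z-scored pass on a normalized time axis. The global
# settings and variable names here are placeholders, not the values of the
# original analysis.
if (FALSE) {
  nbasis <- 42
  bspline_order <- 6
  lambda_i <- 1e-8
  # First pass: only to obtain the per-participant mean and sd (in semitones)
  st_bsplines <- convert2bsplines(waveforms$pitches,
                                  convert_function = function(i, y) hz2semitones(y))
  # Second pass: z-scored curves on the fixed 0-1 time domain
  z_bsplines <- convert2bsplines(waveforms$pitches,
                                 convert_function = z_normalize(st_bsplines,
                                                                semitones = TRUE),
                                 normalize_t = TRUE)
}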
hz2semitones <- function(y) {
  # Semitones relative to 1 Hz
  12 * logb(y, base = 2)
}

# Y normalization using Z-scores
# T normalization using a fixed domain of 1
z_normalize <- function(bsplines, semitones = FALSE) {
  return(function(participant_i, y) {
    if (semitones) {
      ys <- hz2semitones(y)
    } else {
      ys <- y
    }
    (ys - bsplines[participant_i][[1]]$mean) / bsplines[participant_i][[1]]$sd
  })
}

# Collect the per-participant missing-word records into a single data frame
flatten_missings <- function(waveforms) {
  first <- TRUE
  for (p in 1:length(waveforms)) {
    participant <- waveforms[[p]]
    participant_name <- names(waveforms)[[p]]
    missings <- participant$missings
    if (!is.na(missings)[1]) {
      if (first) {
        participants <- as.vector(missings$participant)
        sentences <- as.vector(missings$sentence_pos)
        words <- as.vector(missings$word_name)
        reasons <- as.vector(missings$missing_reason)
        first <- FALSE
      } else {
        participants <- c(participants, as.vector(missings$participant))
        sentences <- c(sentences, as.vector(missings$sentence_pos))
        words <- c(words, as.vector(missings$word_name))
        reasons <- c(reasons, as.vector(missings$missing_reason))
      }
    }
  }
  data.frame(participant = participants,
             sentence_pos = sentences,
             word_name = words,
             reason = reasons)
}

# Find the landmark (highest peak) of a waveform; waveform must be of class fd
find_landmark <- function(waveform) {
  find_candidate <- function(t, offset) {
    # Keep track of the best candidate as a (time, y) pair; a candidate is
    # selected when its |derivative| drops below the initial value
    best_candidate <- c(0.3, 99)
    for (t in seq(t, t - offset, -offset / 1000)) {
      y <- abs(derivf(t))
      if (y < best_candidate[2]) {
        # New lower value found
        best_candidate <- c(t, y)
      }
    }
    return(best_candidate[1])
  }
  deriv <- deriv.fd(waveform)
  derivf <- function(t) eval.fd(t, deriv)
  fmean <- mean(eval.fd(knots.fd(waveform), waveform))
  candidates_list <- NULL
  candidates <- 0
  previous <- NULL
  # Maximum number of candidates to choose from
  max_n_candidates <- 5
  # Past this point, the first candidate found is chosen
  border <- 0.3
  for (t in seq(0, 1, 1/1000)) {
    if (candidates >= max_n_candidates) {
      break
    } else if (t >= border && candidates >= 1) {
      break
    }
    now <- derivf(t)
    if (!is.null(previous) && ((previous < 0) != (now < 0))) {
      # The derivative flipped sign; is this a peak (above the mean)?
      if (eval.fd(t, waveform) > fmean) {
        candidate <- find_candidate(t, 1/1000)
        candidates_list <- c(candidates_list, candidate)
        candidates <- candidates + 1
      }
    }
    previous <- now
  }
  # The highest peak is chosen
  yt <- matrix(c(candidates_list, eval.fd(candidates_list, waveform)), ncol=2)
  return(yt[which.max(yt[, 2]), 1])
}
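# Example (sketch): find_landmark() on a synthetic single peak over [0, 1].
# The basis settings here are illustrative and independent of the globals
# used elsewhere in this script.
if (FALSE) {
  t_grid <- seq(0, 1, length.out = 101)
  y_toy <- dnorm(t_grid, mean = 0.4, sd = 0.1)  # single peak at t = 0.4
  toy_basis <- create.bspline.basis(c(0, 1), 20, 4)
  toy_fd <- smooth.basis(t_grid, y_toy, fdPar(toy_basis, 2, 1e-8))$fd
  find_landmark(toy_fd)  # should be close to 0.4
}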
# Mark the landmark of every selected word for every participant
mark_landmarks <- function(waveforms, max_sentences = NULL, start_from = 1) {
  result <- lapply(1:length(waveforms), function(participant_i) {
    participant <- waveforms[participant_i][[1]]
    landmarks <- list()
    all_landmarks <- NULL
    i <- 1
    for (sentence in participant$waveforms) {
      if (!is.null(max_sentences) && i >= max_sentences + start_from) {
        break
      }
      if (i >= start_from) {
        print(paste('Landmarking participant', participant_i, 'sentence', i))
        # words is assumed to be a globally available data frame with a
        # 'word' column
        landmarks[[i - start_from + 1]] <- lapply(
          sentence[names(sentence) %in% words[, 'word']],
          function(word) {
            if (length(word) > 1 || !is.na(word)) {
              t <- find_landmark(word$fd)
              print(t)
              return(t)
            } else {
              print("Skipped word.")
              return(NA)
            }
          })
        if (is.null(all_landmarks)) {
          all_landmarks <- mapply(c, landmarks[[i - start_from + 1]])
        } else {
          all_landmarks <- c(all_landmarks,
                             mapply(c, landmarks[[i - start_from + 1]]))
        }
      }
      i <- i + 1
    }
    return(list('landmarks' = landmarks, 'all_landmarks' = all_landmarks))
  })
  landmarks <- lapply(result, function(l) l$landmarks)
  all_landmarks <- unlist(lapply(result, function(l) l$all_landmarks))
  names(landmarks) <- names(waveforms)
  all_numeric_landmarks <- all_landmarks[!is.na(all_landmarks)]
  return(list('landmarks' = landmarks,
              'all_landmarks' = all_landmarks,
              'mean' = mean(all_numeric_landmarks),
              'sd' = sd(all_numeric_landmarks)))
}

# Register landmarks, i.e. warp the times of all data!
register_landmarks <- function(landmark_data, waveforms) {
  # Warp times: define the warping basis function.
  # See: Ramsay 2009, Functional Data Analysis with R and MATLAB, page 122
  wbasisLM <- create.bspline.basis(c(0, 1), 4, 3, c(0, landmark_data$mean, 1))
  WfdLM <- fd(matrix(0, 4, 1), wbasisLM)
  WfdParLM <- fdPar(WfdLM, 1, 1e-12)
  registered <- lapply(1:length(landmark_data$landmarks), function(participant_i) {
    participant <- landmark_data$landmarks[[participant_i]]
    participant_name <- names(landmark_data$landmarks)[[participant_i]]
    warped <- list()
    i <- 1
    for (sentence_landmarks in participant) {
      print(paste('Registering landmarks participant', participant_i, 'sentence', i))
      warped[[i]] <- lapply(names(sentence_landmarks), function(word_name) {
        t <- sentence_landmarks[[word_name]]
        f <- waveforms[[participant_name]]$waveforms[[i]][[word_name]]
        if (length(f) <= 1 && is.na(f)) {
          return(NA)
        } else if (length(t) <= 1 && is.na(t)) {
          return(NA)
        } else {
          return(landmarkreg(f$fd, as.matrix(t), as.matrix(landmark_data$mean),
                             WfdParLM, TRUE))
        }
      })
      names(warped[[i]]) <- names(sentence_landmarks)
      i <- i + 1
    }
    return(warped)
  })
  names(registered) <- names(landmark_data$landmarks)
  return(registered)
}
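# Example (sketch): a landmark registration pass chaining the two functions
# above; z_bsplines is the hypothetical output of the normalized
# convert2bsplines() run sketched earlier.
if (FALSE) {
  landmark_data <- mark_landmarks(z_bsplines)
  registered <- register_landmarks(landmark_data, z_bsplines)
}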
# Combine the landmarked waveforms into a single fd object
combine_landmarked_waveforms <- function(landmarked, word_selector = NULL,
                                         exclude = 0) {
  if (is.null(word_selector)) {
    word_selector <- function(w) TRUE
  }
  regfd_list <- list()
  warpfd_list <- list()
  excluded <- 0
  i <- 1
  for (participant_i in 1:length(landmarked)) {
    participant <- landmarked[[participant_i]]
    for (sentence_i in 1:length(participant)) {
      sentence <- participant[[sentence_i]]
      if (length(sentence) < 1) {
        next
      }
      for (word_i in 1:length(sentence)) {
        word_name <- names(sentence)[[word_i]]
        if (word_selector(word_name)) {
          word <- sentence[[word_i]]
          if (length(word) > 1 || !is.na(word)) {
            if (i %in% exclude) {
              excluded <- excluded + 1
            } else {
              regfd_list[[i - excluded]] <- coef(word$regfd)
              warpfd_list[[i - excluded]] <- coef(word$warpfd)
            }
            i <- i + 1
          }
        }
      }
    }
  }
  # The number of rows (42) must match the number of basis functions of the
  # global basis_i
  regfd_coefs <- matrix(unlist(regfd_list), 42, i - excluded - 1)
  warpfd_coefs <- matrix(unlist(warpfd_list), 42, i - excluded - 1)
  combined_regfd <- fd(regfd_coefs, basis_i)
  combined_warpfd <- fd(warpfd_coefs, basis_i)
  return(list(regfd = combined_regfd, warpfd = combined_warpfd))
}

# Find curves that exceed the cutoff anywhere on a 1/100 grid
get_outliers <- function(fdobj, cutoff) {
  outliers <- list()
  for (i in 1:ncol(fdobj$fd$coefs)) {
    if (any(abs(eval.fd(1:99/100, fdobj$fd[i])) > cutoff)) {
      outliers <- append(outliers, i)
    }
  }
  return(unlist(outliers))
}

# Combine the waveforms into a single fd object and get the associated
# metadata as a single data frame
combine_waveforms <- function(waveforms, word_selector = NULL, exclude = 0) {
  coefs_list <- list()
  # Additional data columns:
  # participant ID, sentence number, word pos number, word name
  participant_ids <- list()
  sentence_pos <- list()
  word_pos <- list()
  word_names <- list()
  excluded <- 0
  excluded_participant_ids <- list()
  excluded_sentence_pos <- list()
  excluded_word_pos <- list()
  excluded_word_names <- list()
  exclude_i <- list()
  if (is.null(word_selector)) {
    word_selector <- function(w) w %in% words[['word']]
  }
  i <- 1
  for (participant_i in 1:length(waveforms)) {
    participant <- waveforms[[participant_i]]
    participant_name <- names(waveforms)[[participant_i]]
    for (sentence_i in 1:length(participant$waveforms)) {
      sentence <- participant$waveforms[[sentence_i]]
      current_word_pos <- 1
      if (length(sentence) < 1) {
        next
      }
      for (word_i in 1:length(sentence)) {
        word_name <- names(sentence)[[word_i]]
        if (!is.null(word_name) && word_selector(word_name)) {
          # Actual word
          word <- sentence[[word_i]]
          if (length(word) > 1 || !is.na(word)) {
            word_coefs <- coef(word)
            if (i %in% exclude) {
              excluded <- excluded + 1
              excluded_participant_ids[excluded] <- participant_name
              excluded_sentence_pos[excluded] <- sentence_i
              excluded_word_pos[excluded] <- current_word_pos
              excluded_word_names[excluded] <- word_name
              exclude_i[excluded] <- i
            } else {
              coefs_list[[i - excluded]] <- word_coefs
              participant_ids[i - excluded] <- participant_name
              sentence_pos[i - excluded] <- sentence_i
              word_pos[i - excluded] <- current_word_pos
              word_names[i - excluded] <- word_name
            }
            i <- i + 1
          }
          current_word_pos <- current_word_pos + 1
        }
      }
    }
  }
  # The number of rows (42) must match the number of basis functions of the
  # global basis_i
  coefs <- matrix(unlist(coefs_list), 42, i - excluded - 1)
  combined_fd <- fd(coefs, basis_i)
  metadata <- data.frame(
    participant = unlist(participant_ids),
    sentence_pos = unlist(sentence_pos),
    word_pos = unlist(word_pos),
    word_name = unlist(word_names))
  metadata_excluded <- data.frame(
    participant = unlist(excluded_participant_ids),
    sentence_pos = unlist(excluded_sentence_pos),
    word_pos = unlist(excluded_word_pos),
    word_name = unlist(excluded_word_names),
    exclude_i = unlist(exclude_i))
  return(list(combined_fd, metadata, coefs, metadata_excluded))
}

# Try improving the landmarked waveforms using continuous registration
register_landmarks_continuous <- function(landmarked_waveforms_combined) {
  simple_basis <- create.bspline.basis(c(0, 1), 5, 4)
  register.fd(mean(landmarked_waveforms_combined), landmarked_waveforms_combined,
              simple_basis, iterlim=10, conv=1e-03)
}

# Apply the warping functions obtained from landmark registration
apply_warp <- function(waveforms_combined, landmarked) {
  register.newfd(waveforms_combined, landmarked$warpfd, type='monotone')
}

# Reconstruct a curve from the first two principal component scores
reconstruct <- function(pca_res, pc1, pc2) {
  pca_res$meanfd + pc1 * pca_res$harmonics[1,] + pc2 * pca_res$harmonics[2,]
}
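# Example (sketch): functional PCA over the combined curves, followed by a
# reconstruction from the first two component scores. pca.fd comes from the
# fda package; the inputs are hypothetical results from the functions above.
if (FALSE) {
  combined <- combine_waveforms(z_bsplines)
  metadata <- combined[[2]]
  pca_res <- pca.fd(combined[[1]], nharm = 2)
  # Reconstruct the first curve from its two component scores
  plot(reconstruct(pca_res, pca_res$scores[1, 1], pca_res$scores[1, 2]))
}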
# Intersect two unique data.tables; note that this masks base::intersect
intersect <- function(x, y) {
  z <- rbind(x, y)
  dupes <- which(duplicated(z))
  z[dupes,]
}

# Rows of x that do not occur in y, matched on participant, sentence and word
except <- function(x, y) {
  select_list <- list()
  for (i in 1:dim(x)[1]) {
    row <- x[i,]
    exists <- dim(y[y$participant == as.character(row$participant) &
                    y$sentence_pos == row$sentence_pos &
                    y$word_name == as.character(row$word_name), ])[1] > 0
    if (!exists) {
      select_list <- append(select_list, i)
    }
  }
  x[unlist(select_list),]
}
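# Example (sketch): the set helpers above operate on tables keyed by
# participant, sentence_pos and word_name, such as the metadata and missings
# tables produced earlier. The toy tables here are made up.
if (FALSE) {
  library(data.table)
  a <- data.table(participant = c('p01', 'p01'), sentence_pos = c(1, 2),
                  word_name = c('foo', 'bar'))
  b <- data.table(participant = 'p01', sentence_pos = 1, word_name = 'foo')
  intersect(a, b)  # the row shared by both tables
  except(a, b)     # rows of a not present in b
}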