# Requires the fda package for the basis, smoothing and registration functions
library(fda)

# Read the word timing files for the given participants and merge them into a
# single data frame. Missing files are skipped.
get_timings <- function(participant_ids) {
  first <- TRUE
  timings <- NULL
  for (sid in participant_ids) {
    filename <- paste('Timings/', sid, '.csv', sep='')
    if (!file.exists(filename)) {
      next
    }
    sid_timings <- read.csv2(filename, header=TRUE)
    if (first) {
      timings <- sid_timings
      first <- FALSE
    } else {
      timings <- merge(timings, sid_timings, all=TRUE)
    }
  }
  return(timings)
}

# Read the amplitude and pitch waveforms per participant.
# Structure of the returned lists: [[participant]][[sentence]][[word]]
read_waveforms <- function(words) {
  amplitudes <- list()
  pitches <- list()
  participants <- list()
  all_participants <- list()
  all_sentences <- list()
  all_words <- list()
  i <- 1
  for (p in list.files('Amplitude')) {
    amp_path <- paste('Amplitude/', p, '/', sep='')
    pitch_path <- paste('Pitch/', p, '/', sep='')
    amplitudes[[i]] <- list()
    pitches[[i]] <- list()
    names(amplitudes)[i] <- p
    names(pitches)[i] <- p
    participants[i] <- 0
    # Sentence index associated with the previous word
    previous_word_sentence <- 0
    for (wf in list.files(amp_path)) {
      # Filenames follow the pattern <sentence>_<word number>_<word>.csv
      splitted <- strsplit(wf, '_')[[1]]
      current_sentence <- as.numeric(splitted[1]) + 1
      word_number <- as.numeric(splitted[2])
      word_name <- strsplit(splitted[3], '\\.')[[1]][1]
      if (word_name %in% words) {
        participants[i] <- participants[[i]] + 1
        all_participants <- append(all_participants, p)
        all_sentences <- append(all_sentences, current_sentence)
        all_words <- append(all_words, word_name)
      }
      if (current_sentence > previous_word_sentence) {
        amplitudes[[i]][[current_sentence]] <- list()
        pitches[[i]][[current_sentence]] <- list()
        previous_word_sentence <- current_sentence
      }
      # Sometimes a pitch contour cannot be extracted
      pitch_filename <- paste(pitch_path, wf, sep='')
      if (file.exists(pitch_filename)) {
        amplitude_waveform <- read.csv2(paste(amp_path, wf, sep=''), header=FALSE)
        pitch_waveform <- read.csv2(pitch_filename, header=FALSE)
      } else {
        amplitude_waveform <- NA
        pitch_waveform <- NA
      }
      pitches[[i]][[current_sentence]][[word_number + 1]] <- pitch_waveform
      amplitudes[[i]][[current_sentence]][[word_number + 1]] <- amplitude_waveform
      names(amplitudes[[i]][[current_sentence]])[word_number + 1] <- word_name
      names(pitches[[i]][[current_sentence]])[word_number + 1] <- word_name
    }
    names(participants)[i] <- p
    i <- i + 1
  }
  all_words_metadata <- data.frame(
    participant = unlist(all_participants),
    sentence_pos = unlist(all_sentences),
    word_name = unlist(all_words))
  list('amplitudes' = amplitudes,
       'pitches' = pitches,
       'words_found' = participants,
       'all_words' = all_words_metadata)
}
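# Example (sketch): assuming the working directory contains the Timings/,
# Amplitude/ and Pitch/ folders described above, a typical entry point would
# look like this. The participant ids and target words are placeholders, not
# values from the original analysis.
if (FALSE) {
  timings <- get_timings(c('p01', 'p02'))
  target_words <- c('example')
  waveforms <- read_waveforms(target_words)
  str(waveforms$all_words)
}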
# Used for test purposes and analysis
convert_waveform2bspline <- function(time_list, y) {
  # Similar to Gubian 2010 velocity.R;
  # nbasis, bspline_order and lambda_i are assumed to be set globally
  basis_i <- create.bspline.basis(range(time_list), nbasis, bspline_order)
  Lfdobj <- bspline_order - 2
  # Roughness penalty
  fdPar_i <- fdPar(basis_i, Lfdobj, lambda=lambda_i)
  return(smooth.basis(time_list, y, fdPar_i))
}

# Convert the raw waveforms to B-splines
convert2bsplines <- function(waveforms, convert_function = NULL,
                             normalize_t = FALSE, words = NULL, excluded = NULL) {
  # Also used for determining the per-participant mean and st.dev
  converted <- lapply(1:length(waveforms), function(participant_i) {
    # Keep track of the missing values
    missing_i <- 0
    missing_sentences <- list()
    missing_word_names <- list()
    missing_words <- list()
    missing_reasons <- list()
    participant <- waveforms[participant_i][[1]]
    participant_name <- names(waveforms)[participant_i]
    first_y <- TRUE
    converted_row <- list()
    sentence_i <- 1
    for (sentence in participant) {
      converted_row[[sentence_i]] <- list()
      # This can be skipped due to missing words
      word_i <- 1
      if (length(sentence) > 0) {
        for (actual_word_i in 1:length(sentence)) {
          word_name <- names(sentence)[[actual_word_i]]
          waveform <- sentence[[actual_word_i]]
          include <- TRUE
          if (!is.null(words) && !(word_name %in% words)) {
            include <- FALSE
          }
          if (!is.null(excluded)) {
            matched <- dim(excluded[excluded$participant == participant_name &
                                    excluded$sentence_pos == sentence_i &
                                    excluded$word_name == word_name,])[1]
            if (matched == 1) {
              include <- FALSE
            }
          }
          if (!include) {
            converted_row[[sentence_i]][[word_i]] <- NA
          } else if (!is.null(waveform) && (length(waveform) > 1 || !is.na(waveform))) {
            # Get the word duration from the (global) timings data frame
            duration <- timings[timings['participant'] == participant_name &
                                timings['sentence_pos'] == sentence_i &
                                timings['word_name'] == word_name,][['duration']]
            if (length(duration) == 0) {
              duration_unknown <- TRUE
              duration <- 1
            } else {
              duration_unknown <- FALSE
              duration <- as.numeric(as.character(duration))
            }
            # This might be off a bit, as the start and end might not exactly
            # match the duration
            time_list <- as.numeric(as.character(waveform$V1))
            time_list_limited <- time_list > 0 &
              (duration_unknown | time_list < duration)
            # To work around this, y_left is the average of the samples at the
            # start and y_right the average of those at the end
            t_left <- time_list <= 0
            if (!duration_unknown) {
              t_right <- time_list >= duration
            }
            time_list <- time_list[time_list_limited]
            if (length(time_list) > 1) {
              if (normalize_t) {
                if (duration_unknown) {
                  xlim <- c(0, max(time_list))
                } else {
                  xlim <- c(0, duration)
                }
                # Scale to 0-1
                scale_x <- 1 / max(xlim)
                time_list <- time_list * scale_x
                if (duration_unknown) {
                  # Prevent rounding errors
                  time_list[length(time_list)] <- 1
                }
              }
              y_all <- as.numeric(as.character(waveform$V2))
              y <- y_all[time_list_limited]
              # Get the mean of t <= 0, or the first sample
              y_left <- mean(c(y_all[t_left], y[1]))
              if (is.na(y_left)) {
                y_left <- y[1]
              }
              # Add the averaged start and end times
              if (duration_unknown) {
                time_list <- c(0, time_list)
                y <- c(y_left, y)
              } else {
                # Get the mean of t >= T, or the last sample
                y_right <- mean(c(y_all[t_right], y[length(y)]))
                if (is.na(y_right)) {
                  y_right <- y[length(y)]
                }
                if (normalize_t) {
                  time_list <- c(0, time_list, 1)
                } else {
                  time_list <- c(0, time_list, duration)
                }
                y <- c(y_left, y, y_right)
              }
              if (!is.null(convert_function)) {
                # Normalization or conversion
                y <- convert_function(participant_i, y)
              }
              if (first_y) {
                total_y <- y
                total_xlim <- range(time_list)
                first_y <- FALSE
              } else {
                total_y <- c(total_y, y)
                total_xlim <- range(total_xlim, time_list)
              }
              # Similar to Gubian 2010 velocity.R
              basis_i <- create.bspline.basis(range(time_list), nbasis, bspline_order)
              Lfdobj <- bspline_order - 2
              # Roughness penalty
              fdPar_i <- fdPar(basis_i, Lfdobj, lambda=lambda_i)
              convertedf <- smooth.basis(time_list, y, fdPar_i)
              if (normalize_t) {
                # Assumes z-scores and cuts off at |Z| > 4
                if (any(abs(eval.fd(1:9/10, convertedf$fd)) > 4)) {
                  # Remove outliers
                  convertedf <- NA
                  missing_i <- missing_i + 1
                  missing_sentences[missing_i] <- sentence_i
                  missing_word_names[missing_i] <- word_name
                  missing_words[missing_i] <- actual_word_i
                  missing_reasons[missing_i] <- 'outlier'
                }
              }
              converted_row[[sentence_i]][[word_i]] <- convertedf
            } else {
              converted_row[[sentence_i]][[word_i]] <- NA
              missing_i <- missing_i + 1
              missing_sentences[missing_i] <- sentence_i
              missing_word_names[missing_i] <- word_name
              missing_words[missing_i] <- actual_word_i
              missing_reasons[missing_i] <- 'outside_bounds'
            }
          } else {
            converted_row[[sentence_i]][[word_i]] <- NA
            missing_i <- missing_i + 1
            missing_sentences[missing_i] <- sentence_i
            missing_word_names[missing_i] <- word_name
            missing_words[missing_i] <- actual_word_i
            missing_reasons[missing_i] <- 'no_data'
          }
          word_i <- word_i + 1
        }
      }
      names(converted_row[[sentence_i]]) <- names(sentence)
      sentence_i <- sentence_i + 1
    }
    if (missing_i > 0) {
      missings <- data.frame(
        participant = participant_name,
        sentence_pos = unlist(missing_sentences),
        word_pos = unlist(missing_words),
        word_name = unlist(missing_word_names),
        Exclude = 1,
        missing_reason = unlist(missing_reasons))
    } else {
      missings <- NA
    }
    list('waveforms' = converted_row,
         't_range' = total_xlim,
         'y_range' = range(total_y),
         'mean' = mean(total_y),
         'sd' = sd(total_y),
         'missings' = missings)
  })
  names(converted) <- names(waveforms)
  return(converted)
}
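# Example (sketch): the per-participant mean/sd stored by a first
# convert2bsplines() pass can be fed back through z_normalize() (defined
# below) for a second, z-scored pass on a normalized time axis. The global
# settings and variable names here are placeholders, not the values of the
# original analysis.
if (FALSE) {
  nbasis <- 42
  bspline_order <- 6
  lambda_i <- 1e-8
  # First pass: only to obtain the per-participant mean and sd (in semitones)
  st_bsplines <- convert2bsplines(waveforms$pitches,
                                  convert_function = function(i, y) hz2semitones(y))
  # Second pass: z-scored curves on the fixed 0-1 time domain
  z_bsplines <- convert2bsplines(waveforms$pitches,
                                 convert_function = z_normalize(st_bsplines,
                                                                semitones = TRUE),
                                 normalize_t = TRUE)
}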
hz2semitones <- function(y) {
  # Semitones relative to 1 Hz
  12 * logb(y, base = 2)
}

# Y normalization using Z-scores
# T normalization using a fixed domain of 1
z_normalize <- function(bsplines, semitones = FALSE) {
  return(function(participant_i, y) {
    if (semitones) {
      ys <- hz2semitones(y)
    } else {
      ys <- y
    }
    (ys - bsplines[participant_i][[1]]$mean) / bsplines[participant_i][[1]]$sd
  })
}

# Collect the per-participant missing-word records into a single data frame
flatten_missings <- function(waveforms) {
  first <- TRUE
  for (p in 1:length(waveforms)) {
    participant <- waveforms[[p]]
    participant_name <- names(waveforms)[[p]]
    missings <- participant$missings
    if (!is.na(missings)[1]) {
      if (first) {
        participants <- as.vector(missings$participant)
        sentences <- as.vector(missings$sentence_pos)
        words <- as.vector(missings$word_name)
        reasons <- as.vector(missings$missing_reason)
        first <- FALSE
      } else {
        participants <- c(participants, as.vector(missings$participant))
        sentences <- c(sentences, as.vector(missings$sentence_pos))
        words <- c(words, as.vector(missings$word_name))
        reasons <- c(reasons, as.vector(missings$missing_reason))
      }
    }
  }
  data.frame(participant = participants,
             sentence_pos = sentences,
             word_name = words,
             reason = reasons)
}

# Find the landmark (highest peak) of a waveform; waveform must be of class fd
find_landmark <- function(waveform) {
  find_candidate <- function(t, offset) {
    # Keep track of the best candidate as a (time, y) pair; a candidate is
    # selected when its |derivative| drops below the initial value
    best_candidate <- c(0.3, 99)
    for (t in seq(t, t - offset, -offset / 1000)) {
      y <- abs(derivf(t))
      if (y < best_candidate[2]) {
        # New lower value found
        best_candidate <- c(t, y)
      }
    }
    return(best_candidate[1])
  }
  deriv <- deriv.fd(waveform)
  derivf <- function(t) eval.fd(t, deriv)
  fmean <- mean(eval.fd(knots.fd(waveform), waveform))
  candidates_list <- NULL
  candidates <- 0
  previous <- NULL
  # Maximum number of candidates to choose from
  max_n_candidates <- 5
  # Past this point, the first candidate found is chosen
  border <- 0.3
  for (t in seq(0, 1, 1/1000)) {
    if (candidates >= max_n_candidates) {
      break
    } else if (t >= border && candidates >= 1) {
      break
    }
    now <- derivf(t)
    if (!is.null(previous) && ((previous < 0) != (now < 0))) {
      # The derivative flipped sign; is this a peak (above the mean)?
      if (eval.fd(t, waveform) > fmean) {
        candidate <- find_candidate(t, 1/1000)
        candidates_list <- c(candidates_list, candidate)
        candidates <- candidates + 1
      }
    }
    previous <- now
  }
  # The highest peak is chosen
  yt <- matrix(c(candidates_list, eval.fd(candidates_list, waveform)), ncol=2)
  return(yt[which.max(yt[, 2]), 1])
}
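# Example (sketch): find_landmark() on a synthetic single peak over [0, 1].
# The basis settings here are illustrative and independent of the globals
# used elsewhere in this script.
if (FALSE) {
  t_grid <- seq(0, 1, length.out = 101)
  y_toy <- dnorm(t_grid, mean = 0.4, sd = 0.1)  # single peak at t = 0.4
  toy_basis <- create.bspline.basis(c(0, 1), 20, 4)
  toy_fd <- smooth.basis(t_grid, y_toy, fdPar(toy_basis, 2, 1e-8))$fd
  find_landmark(toy_fd)  # should be close to 0.4
}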
# Mark the landmark of every selected word for every participant
mark_landmarks <- function(waveforms, max_sentences = NULL, start_from = 1) {
  result <- lapply(1:length(waveforms), function(participant_i) {
    participant <- waveforms[participant_i][[1]]
    landmarks <- list()
    all_landmarks <- NULL
    i <- 1
    for (sentence in participant$waveforms) {
      if (!is.null(max_sentences) && i >= max_sentences + start_from) {
        break
      }
      if (i >= start_from) {
        print(paste('Landmarking participant', participant_i, 'sentence', i))
        # words is assumed to be a globally available data frame with a
        # 'word' column
        landmarks[[i - start_from + 1]] <- lapply(
          sentence[names(sentence) %in% words[, 'word']],
          function(word) {
            if (length(word) > 1 || !is.na(word)) {
              t <- find_landmark(word$fd)
              print(t)
              return(t)
            } else {
              print("Skipped word.")
              return(NA)
            }
          })
        if (is.null(all_landmarks)) {
          all_landmarks <- mapply(c, landmarks[[i - start_from + 1]])
        } else {
          all_landmarks <- c(all_landmarks,
                             mapply(c, landmarks[[i - start_from + 1]]))
        }
      }
      i <- i + 1
    }
    return(list('landmarks' = landmarks, 'all_landmarks' = all_landmarks))
  })
  landmarks <- lapply(result, function(l) l$landmarks)
  all_landmarks <- unlist(lapply(result, function(l) l$all_landmarks))
  names(landmarks) <- names(waveforms)
  all_numeric_landmarks <- all_landmarks[!is.na(all_landmarks)]
  return(list('landmarks' = landmarks,
              'all_landmarks' = all_landmarks,
              'mean' = mean(all_numeric_landmarks),
              'sd' = sd(all_numeric_landmarks)))
}

# Register landmarks, i.e. warp the times of all data!
register_landmarks <- function(landmark_data, waveforms) {
  # Warp times: define the warping basis function.
  # See: Ramsay 2009, Functional Data Analysis with R and MATLAB, page 122
  wbasisLM <- create.bspline.basis(c(0, 1), 4, 3, c(0, landmark_data$mean, 1))
  WfdLM <- fd(matrix(0, 4, 1), wbasisLM)
  WfdParLM <- fdPar(WfdLM, 1, 1e-12)
  registered <- lapply(1:length(landmark_data$landmarks), function(participant_i) {
    participant <- landmark_data$landmarks[[participant_i]]
    participant_name <- names(landmark_data$landmarks)[[participant_i]]
    warped <- list()
    i <- 1
    for (sentence_landmarks in participant) {
      print(paste('Registering landmarks participant', participant_i, 'sentence', i))
      warped[[i]] <- lapply(names(sentence_landmarks), function(word_name) {
        t <- sentence_landmarks[[word_name]]
        f <- waveforms[[participant_name]]$waveforms[[i]][[word_name]]
        if (length(f) <= 1 && is.na(f)) {
          return(NA)
        } else if (length(t) <= 1 && is.na(t)) {
          return(NA)
        } else {
          return(landmarkreg(f$fd, as.matrix(t), as.matrix(landmark_data$mean),
                             WfdParLM, TRUE))
        }
      })
      names(warped[[i]]) <- names(sentence_landmarks)
      i <- i + 1
    }
    return(warped)
  })
  names(registered) <- names(landmark_data$landmarks)
  return(registered)
}
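# Example (sketch): a landmark registration pass chaining the two functions
# above; z_bsplines is the hypothetical output of the normalized
# convert2bsplines() run sketched earlier.
if (FALSE) {
  landmark_data <- mark_landmarks(z_bsplines)
  registered <- register_landmarks(landmark_data, z_bsplines)
}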
# Combine the landmarked waveforms into a single fd object
combine_landmarked_waveforms <- function(landmarked, word_selector = NULL,
                                         exclude = 0) {
  if (is.null(word_selector)) {
    word_selector <- function(w) TRUE
  }
  regfd_list <- list()
  warpfd_list <- list()
  excluded <- 0
  i <- 1
  for (participant_i in 1:length(landmarked)) {
    participant <- landmarked[[participant_i]]
    for (sentence_i in 1:length(participant)) {
      sentence <- participant[[sentence_i]]
      if (length(sentence) < 1) {
        next
      }
      for (word_i in 1:length(sentence)) {
        word_name <- names(sentence)[[word_i]]
        if (word_selector(word_name)) {
          word <- sentence[[word_i]]
          if (length(word) > 1 || !is.na(word)) {
            if (i %in% exclude) {
              excluded <- excluded + 1
            } else {
              regfd_list[[i - excluded]] <- coef(word$regfd)
              warpfd_list[[i - excluded]] <- coef(word$warpfd)
            }
            i <- i + 1
          }
        }
      }
    }
  }
  # The number of rows (42) must match the number of basis functions of the
  # global basis_i
  regfd_coefs <- matrix(unlist(regfd_list), 42, i - excluded - 1)
  warpfd_coefs <- matrix(unlist(warpfd_list), 42, i - excluded - 1)
  combined_regfd <- fd(regfd_coefs, basis_i)
  combined_warpfd <- fd(warpfd_coefs, basis_i)
  return(list(regfd = combined_regfd, warpfd = combined_warpfd))
}

# Find curves that exceed the cutoff anywhere on a 1/100 grid
get_outliers <- function(fdobj, cutoff) {
  outliers <- list()
  for (i in 1:ncol(fdobj$fd$coefs)) {
    if (any(abs(eval.fd(1:99/100, fdobj$fd[i])) > cutoff)) {
      outliers <- append(outliers, i)
    }
  }
  return(unlist(outliers))
}

# Combine the waveforms into a single fd object and get the associated
# metadata as a single data frame
combine_waveforms <- function(waveforms, word_selector = NULL, exclude = 0) {
  coefs_list <- list()
  # Additional data columns:
  # participant ID, sentence number, word pos number, word name
  participant_ids <- list()
  sentence_pos <- list()
  word_pos <- list()
  word_names <- list()
  excluded <- 0
  excluded_participant_ids <- list()
  excluded_sentence_pos <- list()
  excluded_word_pos <- list()
  excluded_word_names <- list()
  exclude_i <- list()
  if (is.null(word_selector)) {
    word_selector <- function(w) w %in% words[['word']]
  }
  i <- 1
  for (participant_i in 1:length(waveforms)) {
    participant <- waveforms[[participant_i]]
    participant_name <- names(waveforms)[[participant_i]]
    for (sentence_i in 1:length(participant$waveforms)) {
      sentence <- participant$waveforms[[sentence_i]]
      current_word_pos <- 1
      if (length(sentence) < 1) {
        next
      }
      for (word_i in 1:length(sentence)) {
        word_name <- names(sentence)[[word_i]]
        if (!is.null(word_name) && word_selector(word_name)) {
          # Actual word
          word <- sentence[[word_i]]
          if (length(word) > 1 || !is.na(word)) {
            word_coefs <- coef(word)
            if (i %in% exclude) {
              excluded <- excluded + 1
              excluded_participant_ids[excluded] <- participant_name
              excluded_sentence_pos[excluded] <- sentence_i
              excluded_word_pos[excluded] <- current_word_pos
              excluded_word_names[excluded] <- word_name
              exclude_i[excluded] <- i
            } else {
              coefs_list[[i - excluded]] <- word_coefs
              participant_ids[i - excluded] <- participant_name
              sentence_pos[i - excluded] <- sentence_i
              word_pos[i - excluded] <- current_word_pos
              word_names[i - excluded] <- word_name
            }
            i <- i + 1
          }
          current_word_pos <- current_word_pos + 1
        }
      }
    }
  }
  # The number of rows (42) must match the number of basis functions of the
  # global basis_i
  coefs <- matrix(unlist(coefs_list), 42, i - excluded - 1)
  combined_fd <- fd(coefs, basis_i)
  metadata <- data.frame(
    participant = unlist(participant_ids),
    sentence_pos = unlist(sentence_pos),
    word_pos = unlist(word_pos),
    word_name = unlist(word_names))
  metadata_excluded <- data.frame(
    participant = unlist(excluded_participant_ids),
    sentence_pos = unlist(excluded_sentence_pos),
    word_pos = unlist(excluded_word_pos),
    word_name = unlist(excluded_word_names),
    exclude_i = unlist(exclude_i))
  return(list(combined_fd, metadata, coefs, metadata_excluded))
}

# Try improving the landmarked waveforms using continuous registration
register_landmarks_continuous <- function(landmarked_waveforms_combined) {
  simple_basis <- create.bspline.basis(c(0, 1), 5, 4)
  register.fd(mean(landmarked_waveforms_combined), landmarked_waveforms_combined,
              simple_basis, iterlim=10, conv=1e-03)
}

# Apply the warping functions obtained from landmark registration
apply_warp <- function(waveforms_combined, landmarked) {
  register.newfd(waveforms_combined, landmarked$warpfd, type='monotone')
}

# Reconstruct a curve from the first two principal component scores
reconstruct <- function(pca_res, pc1, pc2) {
  pca_res$meanfd + pc1 * pca_res$harmonics[1,] + pc2 * pca_res$harmonics[2,]
}
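# Example (sketch): functional PCA over the combined curves, followed by a
# reconstruction from the first two component scores. pca.fd comes from the
# fda package; the inputs are hypothetical results from the functions above.
if (FALSE) {
  combined <- combine_waveforms(z_bsplines)
  metadata <- combined[[2]]
  pca_res <- pca.fd(combined[[1]], nharm = 2)
  # Reconstruct the first curve from its two component scores
  plot(reconstruct(pca_res, pca_res$scores[1, 1], pca_res$scores[1, 2]))
}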
# Intersect two unique data.tables; note that this masks base::intersect
intersect <- function(x, y) {
  z <- rbind(x, y)
  dupes <- which(duplicated(z))
  z[dupes,]
}

# Rows of x that do not occur in y, matched on participant, sentence and word
except <- function(x, y) {
  select_list <- list()
  for (i in 1:dim(x)[1]) {
    row <- x[i,]
    exists <- dim(y[y$participant == as.character(row$participant) &
                    y$sentence_pos == row$sentence_pos &
                    y$word_name == as.character(row$word_name), ])[1] > 0
    if (!exists) {
      select_list <- append(select_list, i)
    }
  }
  x[unlist(select_list),]
}
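# Example (sketch): the set helpers above operate on tables keyed by
# participant, sentence_pos and word_name, such as the metadata and missings
# tables produced earlier. The toy tables here are made up.
if (FALSE) {
  library(data.table)
  a <- data.table(participant = c('p01', 'p01'), sentence_pos = c(1, 2),
                  word_name = c('foo', 'bar'))
  b <- data.table(participant = 'p01', sentence_pos = 1, word_name = 'foo')
  intersect(a, b)  # the row shared by both tables
  except(a, b)     # rows of a not present in b
}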