-
Notifications
You must be signed in to change notification settings - Fork 0
/
elp.R
48 lines (44 loc) · 1.45 KB
/
elp.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/usr/bin/env Rscript
# Kyle Gorman <[email protected]>
#
# Setup for experiments matching regular and irregular verbs on:
#
# * log wordform frequency
# * log lemma frequency
# * suffix conditional probability
# * OLD20
# * squared orthographic word length
# * number of syllables
#
# Thanks to Constantine Lignos for help with the ELP and SUBTLEXus data.
# NOTE: the libraries below are not needed by anything in this file, so they
# are left commented out; they could be moved to match...R instead.
# suppressPackageStartupMessages(library(doMC))
# suppressPackageStartupMessages(library(ldamatch))
# suppressPackageStartupMessages(library(plyr))
# Input CSV file names. Both are passed to read.csv() as relative paths, so
# they are looked up in the current working directory.
#
# Table read by get_elp(); the columns it uses are word, root, suffix,
# sbtlx.freq, sbtlx.basefreq, sbtlx.pformbase, OLD, length and n.syll.
ELP_WORDS_MERGED <- "elp_words_merged.csv"
# Table read by get_irregulars(); the columns it uses are VBD and VBN.
ENGLISH_IRREGULARS <- "english_irregulars.csv"
# Builds the set of irregular verb forms, case-folded.
#
# Reads ENGLISH_IRREGULARS, pools its VBD and VBN columns, removes duplicate
# spellings and lower-cases what is left. Returns the result as a factor.
get_irregulars <- function() {
  verbs <- read.csv(ENGLISH_IRREGULARS)
  vbd_forms <- as.character(verbs[, "VBD"])
  vbn_forms <- as.character(verbs[, "VBN"])
  # De-duplicate on the raw spellings first, then fold case.
  distinct_forms <- unique(c(vbd_forms, vbn_forms))
  factor(casefold(distinct_forms))
}
# Creates dataframe with all ELP variables.
#
# Reads ELP_WORDS_MERGED and keeps the rows that are possibly past tense
# verbs: those whose `suffix` is "ed" or whose `word` is one of the forms
# returned by get_irregulars().
#
# Returns a data frame with the columns word and root (both case-folded),
# regularity (factor: "irregular" if `word` is in the irregulars, otherwise
# "regular"), sbtlx.freq, sbtlx.basefreq, sbtlx.pformbase, OLD,
# length.squared (length * length) and n.syll. Rows containing an NA in any
# of these columns are dropped.
#
# Stops with an error if the CSV lacks any column the function needs.
get_elp <- function() {
  irregulars <- get_irregulars()
  d <- read.csv(ELP_WORDS_MERGED)

  # Fail early with a readable message instead of a confusing downstream
  # error (e.g. a missing `length` column resolving to base::length).
  required <- c(
    "word", "root", "suffix", "sbtlx.freq", "sbtlx.basefreq",
    "sbtlx.pformbase", "OLD", "length", "n.syll"
  )
  missing_cols <- setdiff(required, names(d))
  if (length(missing_cols) > 0) {
    stop(
      ELP_WORDS_MERGED, " is missing required column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # NOTE(review): `word` is compared with the case-folded irregulars before
  # it is case-folded itself, so an entry containing upper-case letters is
  # never recognised as irregular. Left as is -- confirm this is intended.

  # Possibly a past tense verb. which() discards rows whose condition is NA,
  # which is what subset() did.
  keep <- d[["suffix"]] == "ed" | d[["word"]] %in% irregulars
  d <- d[which(keep), , drop = FALSE]

  # Explicit column access (rather than with()) so that a CSV column can
  # never shadow the local `irregulars`.
  is_irregular <- d[["word"]] %in% irregulars
  out <- data.frame(
    word = casefold(d[["word"]]),
    root = casefold(d[["root"]]),
    regularity = as.factor(ifelse(is_irregular, "irregular", "regular")),
    sbtlx.freq = d[["sbtlx.freq"]],
    sbtlx.basefreq = d[["sbtlx.basefreq"]],
    sbtlx.pformbase = d[["sbtlx.pformbase"]],
    OLD = d[["OLD"]],
    length.squared = d[["length"]] * d[["length"]],
    n.syll = d[["n.syll"]]
  )
  out[complete.cases(out), ]
}