Analyzing CHILDES with NLTK
Setting up
svn checkout http://nltk.googlecode.com/svn/trunk/ ~/ANY_PATH/nltk-read-only
# Note: there is an 'nltk' directory inside another 'nltk' directory.
cd ~/ANY_PATH/nltk-read-only/nltk
python
>>> import nltk
# Check that you are reading the correct version of the NLTK package.
>>> nltk.__version__
>>> nltk.__file__
Updates
Testing your setup

name@home:~/$ python
Python 2.6.1 (r261:67515, Jun 24 2010, 21:47:49)
[GCC 4.2.1 (Apple Inc. build 5646)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> import nltk
>>> nltk.corpus.reader.childes.demo()
Reading bates amy20 .....
words: ["what's", 'that', 'yyy', "it's", 'a', 'chicken', 'yeah'] ...
words with replaced words: ["what's", 'that', 'yyy', "it's", 'a', 'chicken', 'yeah'] ...
words with pos tags: [("what's", 'pro:wh'), ('that', 'pro:dem'), ('yyy', 'unk'), ("it's", 'pro:cop'), ('a', 'det'), ('chicken', 'n'), ('yeah', 'co')] ...
words (only MOT): ["what's", 'that', "it's", 'a', 'chicken', 'yeah', "what's"] ...
words (only CHI): ['yyy', 'yeah', 'yyy', 'yeah', 'xxx', 'woof', 'hat'] ...
stemmed words: ['what', 'be', 'that', 'yyy', 'it', 'be', 'a'] ...
words with relations and pos-tag: [[('what', 'pro:wh', '1|2|PRED'), ('be', 'v', '2|0|ROOT'), ('that', 'pro:dem', '3|2|SUBJ')], [('yyy', 'unk', '1|0|ROOT')], [('it', 'pro:cop', '1|2|SUBJ'), ('be', 'v', '2|0|ROOT'), ('a', 'det', '3|4|DET'), ('chicken', 'n', '4|2|PRED')], [('yeah', 'co', '1|0|ROOT')], [('yeah', 'co', '1|0|ROOT')]] ...
sentence: [["what's", 'that'], ['yyy']] ...
participant CHI group : normal
participant CHI language : eng
participant CHI age : P1Y8M
participant CHI sex : female
participant CHI role : Child
participant CHI id : CHI
participant MOT role : Mother
participant MOT id : MOT
participant MOT language : eng
num of sent: 170
num of morphemes: 296
age: ['P1Y8M']
age in month: [20]
MLU: [1.1428571428571428]
Reading bates betty20 .....
words: ['want', 'to', 'stack', 'them', 'for', 'me', 'yyy'] ...
words with replaced words: ['want', 'to', 'stack', 'them', 'for', 'me', 'yyy'] ...
words with pos tags: [('want', 'v'), ('to', 'inf'), ('stack', 'v'), ('them', 'pro'), ('for', 'prep'), ('me', 'pro'), ('yyy', 'unk')] ...
words (only MOT): ['want', 'to', 'stack', 'them', 'for', 'me', 'oh'] ...
words (only CHI): ['yyy', 'that', 'baby', 'yyy', 'yyy', 'uh', 'xxx'] ...
stemmed words: ['want', 'to', 'stack', 'them', 'for', 'me', 'yyy'] ...
words with relations and pos-tag: [[('want', 'v', '1|0|ROOT'), ('to', 'inf', '2|3|INF'), ('stack', 'v', '3|1|XCOMP'), ('them', 'pro', '4|3|OBJ'), ('for', 'prep', '5|3|JCT'), ('me', 'pro', '6|5|POBJ')], [], [], [('yyy', 'unk', '1|0|ROOT')], []] ...
sentence: [['want', 'to', 'stack', 'them', 'for', 'me'], []] ...
participant CHI group : normal
participant CHI name : Betty
participant CHI language : eng
participant CHI age : P1Y8M
participant CHI sex : female
participant CHI role : Target_Child
participant CHI id : CHI
participant MOT role : Mother
participant MOT id : MOT
participant MOT language : eng
num of sent: 148
num of morphemes: 406
age: ['P1Y8M']
age in month: [20]
MLU: [1.1428571428571428]
...
Basic usage
name@home:~/$ python
Python 2.6.1 (r261:67515, Jun 24 2010, 21:47:49)
[GCC 4.2.1 (Apple Inc. build 5646)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> import nltk
>>> from nltk.corpus.reader import CHILDESCorpusReader
>>> corpus_root = nltk.data.find('corpora/childes/data-xml/Eng-USA/')
>>> valian = CHILDESCorpusReader(corpus_root, u'Valian/.*.xml')
>>> valian.fileids()[1:10]
['Valian/01b.xml', 'Valian/02a.xml', 'Valian/02b.xml', 'Valian/03a.xml', 'Valian/03b.xml', 'Valian/04a.xml'...
>>> len(valian.fileids())
43
>>> corpus_data = valian.corpus(valian.fileids())
>>> print corpus_data[0]
{'Lang': 'eng', '{http://www.w3.org/2001/XMLSchema-instance}schemaLocation': ...
>>> for key in corpus_data[0].keys():
...     print key, ": ", corpus_data[0][key]
Lang : eng
{http://www.w3.org/2001/XMLSchema-instance}schemaLocation : ...
Version : 1.5.6
Date : 1986-03-04
Corpus : valian
Id : 01a
>>> corpus_participants = valian.participants(valian.fileids())
>>> for this_corpus_participants in corpus_participants[:2]:
...     for key in this_corpus_participants.keys():
...         print key, ": ", this_corpus_participants[key]
CHI : defaultdict(, {'group': 'normal'...
INV : defaultdict(, {'role': 'Investigator'...
MOT : defaultdict(, {'role': 'Mother',...
>>> valian.words('Valian/01a.xml')
['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', ...
>>> valian.words(valian.fileids()[0])
['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', ...
# this will process all files in the Valian corpus
>>> valian.words(valian.fileids()[:])
['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', ...
>>> valian.sents('Valian/01a.xml')
[['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', 'and', 'it', 'is', 'March',
'fourth', 'I', 'believe', 'and', 'when', 'was', "Parent's", 'birthday'], ["Child's"],
['oh', "I'm", 'sorry'], ["that's", 'okay'], ...
>>> valian.words('Valian/01a.xml',speaker=['INV'])
['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', ...
>>> valian.words('Valian/01a.xml',speaker=['MOT'])
["Child's", "that's", 'okay', 'February', 'first', 'nineteen', ...
>>> valian.words('Valian/01a.xml',speaker=['CHI'])
['tape', 'it', 'up', 'and', 'two', 'tape', 'players', 'have',...
>>> valian.tagged_words('Valian/01a.xml')
[('at', 'prep'), ('Parent', 'n:prop'), ("Lastname's", 'n:prop'), ('house', 'n'), ...
>>> valian.tagged_words('Valian/01a.xml')
[('at', 'prep'), ('Parent', 'n:prop'), ("Lastname's", 'n:prop'), ('house', 'n'), ...
>>> valian.tagged_sents('Valian/01a.xml')
[[('at', 'prep'), ('Parent', 'n:prop'), ("Lastname's", 'n:prop'), ...
>>> valian.words('Valian/01a.xml',speaker=['CHI'])[247]
'tikteat'
>>> valian.words('Valian/01a.xml',speaker=['CHI'],replace=True)[247]
'trick'
>>> valian.words('Valian/01a.xml',relation=True)
[[('at', 'prep', '1|9|COORD'), ('Parent', 'n:prop', '2|5|NAME'), ('Lastname', 'n:prop', '3|5|MOD'),
('s', 'poss', '4|5|MOD'), ('house', 'n', '5|1|POBJ'), ('with', 'prep', '6|9|COORD'),
('Child', 'n:prop', '7|9|COORD'), ('Lastname', 'n:prop', '8|9|COORD'), ('and', 'conj:coo', '9|16|COORD'),
('it', 'pro', '10|11|SUBJ'), ('be-3S', 'v:cop', '11|9|COORD'), ('March', 'n:prop', '12|11|PRED'),
('fourth', 'adj', '13|16|COORD'), ('I', 'pro', '14|15|SUBJ'), ('believe', 'v', '15|16|COORD'),
('and', 'conj:coo', '16|0|ROOT'), ('when', 'adv:wh', '17|18|PRED'), ('be-PAST', 'v:cop', '18|16|COORD'),
('Parent', 'n:prop', '19|21|MOD'), ('s', 'poss', '20|21|MOD'), ('birthday', 'n', '21|18|SUBJ')],
[('Child', 'n:prop', '1|2|MOD'), ('s', 'poss', '2|0|ROOT')], [('oh', 'co', '1|3|COM'),
('I', 'pro:cop', '2|3|SUBJ'), ('be', 'v', '3|0|ROOT'), ('sorry', 'adj', '4|3|PRED')],
[('that', 'pro:dem', '1|2|SUBJ'), ('be', 'v', '2|0|ROOT'), ('okay', 'adj', '3|2|PRED')],
[('February', 'n:prop', '1|5|VOC'), ('first', 'adj', '2|5|ENUM'), ('nineteen', 'det:num', '3|5|ENUM'),
('eighty', 'det:num', '4|5|ENUM'), ('four', 'det:num', '5|0|ROOT')], [('great', 'adj', '1|0|ROOT')],
[('and', 'conj:coo', '1|0|ROOT'), ('she', 'pro:cop', '2|3|SUBJ'), ('be', 'v', '3|1|COORD'),
('two', 'det:num', '4|5|QUANT'), ('year-PL', 'n', '5|3|PRED'), ('old', 'adj', '6|3|PRED')],
[('correct', 'adj', '1|0|ROOT')], [('okay', 'co', '1|0|ROOT')], [('she', 'pro', '1|3|SUBJ'),
('just', 'adv:int', '2|3|JCT'), ('turn-PERF', 'part', '3|0|ROOT'), ('two', 'det:num', '4|6|QUANT'),
('a', 'det', '5|6|DET'), ('month', 'n', '6|3|OBJ'), ('ago', 'adv', '7|3|JCT')]]
>>> valian.age()
['P2Y1M3D', 'P2Y1M12D', 'P1Y9M21D', 'P1Y9M28D', 'P2Y1M23D', ...
>>> valian.age('Valian/01a.xml')
['P2Y1M3D']
>>> valian.age('Valian/01a.xml',month=True)
[25]
>>> valian.MLU()
[1.8798283261802575, 1.9375, 2.6983240223463687, 2.3945945945945946, ...
>>> valian.MLU('Valian/01a.xml')
[1.8798283261802575]
Example 1 (Age and MLU)
#---------- usingCHILDESWithNLTK1.py ----------#
# Description: Processing CHILDES XML data
# Date: January 17, 2011
# Written by Tomonori Nagano (tnagano@gc.cuny.edu) for Python2.5
import nltk
from nltk.corpus.reader import CHILDESCorpusReader
# Build a reader over every XML transcript below the Eng-USA CHILDES directory.
corpus_root = nltk.data.find('corpora/childes/data-xml/Eng-USA/')
childes = CHILDESCorpusReader(corpus_root,u'.*.xml')

# Write one tab-separated row per transcript: file id, age in months, MLU.
outfile = open('usingCHILDESWithNLTK1Data.txt', 'w') # open output file
try:
    print >> outfile, "\t".join(["filename","age","MLU"])
    # run the entire CHILDES corpus
    for filename in childes.fileids():
        # age() and MLU() return a list with one entry per file id, hence [0].
        # age may be None; it is then written as the string "None".
        age = childes.age(filename,month=True)[0]
        MLU = childes.MLU(filename)[0]
        print >> outfile, "\t".join([filename, str(age), str(MLU)])
finally:
    # Always close the file so buffered rows are flushed, even if a
    # transcript fails to parse part-way through the corpus.
    outfile.close()
####################
# Plotting with R
####################
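# (The Python script above writes 'usingCHILDESWithNLTK1Data.txt'; point the path below at that file.)
#thisData <- read.table("~/Desktop/data1.txt",sep="\t",fill=TRUE,header=TRUE)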
#thisData = subset(thisData,age!="None"&age!=0&MLU!=0)
#thisData$age = as.numeric(thisData$age)
#plot(thisData$age,thisData$MLU,ylim=c(0,10),xlab="Age (in Month)",ylab="MLU",main="Age and MLU\nAll CHILDES corpora")
#abline(lm(thisData$MLU~thisData$age-1),col="red")
filename age MLU
Bates/Free20/amy20.xml 20 1.14285714286
Bates/Free20/betty20.xml 20 1.14285714286
Bates/Free20/chuck20.xml 20 1.85714285714
Bates/Free20/doug20.xml 20 1.4
Bates/Free20/ed20.xml 20 1.33333333333
Bates/Free20/frank20.xml 32 1.76923076923
Bates/Free20/gloria20.xml 22 1.25
Bates/Free20/hank20.xml 20 1.5
Bates/Free20/ivy20.xml 20 4.0
Bates/Free20/jane20.xml 20 1.0
Bates/Free20/jim20.xml 20 1.41666666667
Bates/Free20/kathy20.xml 20 1.5
Bates/Free20/keith20.xml 20 2.34375
Bates/Free20/kent20.xml 20 1.0
Bates/Free20/linda20.xml 20 1.55555555556
Bates/Free20/mandy20.xml 20 1.58333333333
Bates/Free20/nan20.xml 20 1.0625
Bates/Free20/olivia20.xml 20 1.78787878788
Bates/Free20/paula20.xml 20 1.25
Bates/Free20/pete20.xml 20 1.0
...
Example 2 (Counting modal use by age)
#---------- usingCHILDESWithNLTK2.py ----------#
# Description: Processing CHILDES XML data
# Date: January 17, 2011
# Written by Tomonori Nagano (tnagano@gc.cuny.edu) for Python2.5
import nltk
from nltk.probability import ConditionalFreqDist, FreqDist
from nltk.corpus.reader import CHILDESCorpusReader
# Modal auxiliaries (full and contracted forms) tabulated per age band below.
# The contracted "will" used to be spelled with the digits "'11"; it is the
# letters "'ll". The sample output printed after this script still shows the
# old "'11" column label.
# NOTE(review): "'ld" looks like a typo for the contracted form "'d" -- confirm.
# NOTE(review): words() output elsewhere on this page keeps contractions
# attached ("what's", "it's"), so stand-alone clitics may never match -- confirm.
modals = ["can","can't","could","couldn't","'ld","will","'ll","won't","would","wouldn't","shall","should","shouldn't","may","might","must","ought"]
# Semi-modals; defined here but not referenced by the rest of this script.
semimodal = ["gonna","wanna","hafta","gotta"]
# Reader over every XML transcript below the Eng-USA CHILDES directory.
corpus_root = nltk.data.find('corpora/childes/data-xml/Eng-USA/')
childes = CHILDESCorpusReader(corpus_root,u'.*.xml')
# One frequency distribution of child tokens per age-band label.
cfd = ConditionalFreqDist()
# Labels for consecutive 12-month age bands; ages[0] is used for children
# younger than 12 months, ages[1] for 12-23 months, and so on.
ages = ['1yld','2yld','3yld','4yld','5yld','6yld','7yld']
# Pool the child's (CHI) tokens of every transcript into a 12-month age band.
# This runs the entire corpus; slice fileids() (e.g. [:100]) for a quick trial.
for filename in childes.fileids()[:]:
    age = childes.age(filename,month=True)[0]
    if age is None:
        # No usable age for this transcript: skip it without reading its words.
        continue
    # Band 0 is everything under 12 months, band 1 is 12-23 months, ...
    band = max(0, int(age) // 12)
    if band >= len(ages):
        # 84 months and older are outside the tabulated bands.
        continue
    fd = cfd[ages[band]]
    # NOTE(review): FreqDist.inc() is the NLTK 2.x API; on NLTK 3 the
    # equivalent is fd[word] += 1.
    for word in childes.words(filename,speaker=["CHI"]):
        fd.inc(word)
print "age\tallTokens\t","\t".join(modals)
for thisAge in ages:
print thisAge,"\t",cfd[thisAge].N(),"\t",
for modal in modals:
print cfd[thisAge][modal],"\t",
print ""
print "age\tallTokens\t","\t".join(modals)
for thisAge in ages:
print thisAge,"\t",cfd[thisAge].N(),"\t",
for modal in modals:
try:
thisProp = float(cfd[thisAge][modal])/float(cfd[thisAge].N())*100
except:
thisProp = 0
print "%.2f%%\t" % thisProp,
print ""
age allTokens can can't could couldn't 'ld will '11 won't would wouldn't shall should shouldn't may might must ought
2yld 102494 67 47 0 0 0 15 0 3 2 0 0 0 0 6 2 2 0
3yld 525729 1570 1183 190 12 0 697 0 170 138 10 29 61 2 63 52 39 1
4yld 505159 2883 1229 1072 85 0 1265 0 368 566 46 23 175 25 116 96 52 2
5yld 332560 2279 793 640 106 0 469 0 228 406 82 5 147 15 42 87 28 0
6yld 151908 937 316 251 35 0 190 0 101 194 36 2 73 7 31 54 19 1
7yld 36053 165 75 66 15 0 30 0 25 46 9 1 21 1 14 16 4 1
age allTokens can can't could couldn't 'ld will '11 won't would wouldn't shall should shouldn't may might must ought
2yld 102494 0.07% 0.05% 0.00% 0.00% 0.00% 0.01% 0.00% 0.00% 0.00% 0.00% 0.00% 0.00% 0.00% 0.01% 0.00% 0.00% 0.00%
3yld 525729 0.30% 0.23% 0.04% 0.00% 0.00% 0.13% 0.00% 0.03% 0.03% 0.00% 0.01% 0.01% 0.00% 0.01% 0.01% 0.01% 0.00%
4yld 505159 0.57% 0.24% 0.21% 0.02% 0.00% 0.25% 0.00% 0.07% 0.11% 0.01% 0.00% 0.03% 0.00% 0.02% 0.02% 0.01% 0.00%
5yld 332560 0.69% 0.24% 0.19% 0.03% 0.00% 0.14% 0.00% 0.07% 0.12% 0.02% 0.00% 0.04% 0.00% 0.01% 0.03% 0.01% 0.00%
6yld 151908 0.62% 0.21% 0.17% 0.02% 0.00% 0.13% 0.00% 0.07% 0.13% 0.02% 0.00% 0.05% 0.00% 0.02% 0.04% 0.01% 0.00%
7yld 36053 0.46% 0.21% 0.18% 0.04% 0.00% 0.08% 0.00% 0.07% 0.13% 0.02% 0.00% 0.06% 0.00% 0.04% 0.04% 0.01% 0.00%
Example 3 (Counting Levin's verb class)
#---------- usingCHILDESWithNLTK3.py ----------#
# Description: Processing CHILDES XML data
# Date: January 17, 2011
# Written by Tomonori Nagano (tnagano@gc.cuny.edu) for Python2.5
import nltk
from nltk.probability import ConditionalFreqDist, FreqDist
from nltk.corpus.reader import CHILDESCorpusReader
# Reader over every XML transcript below the Eng-USA CHILDES directory.
# The data directory is spelled in lower case ('corpora/childes/'), as in the
# other examples on this page; the upper-case spelling only resolves on
# case-insensitive file systems.
corpus_root = nltk.data.find('corpora/childes/data-xml/Eng-USA/')
childes = CHILDESCorpusReader(corpus_root,u'.*.xml')
# Verb classes mapped to the verb lemmas that belong to them.
# based on Levin (1993) and Pinker (1989; 2007)
# need some work on manner-of-motion_nonalternate
#
# The report loop prints one row per list entry, so a lemma listed twice in a
# class (e.g. 'paint' and 'sketch' in the first class) yields a repeated row.
# The duplicates are kept as they were; only the misspelled 'stablize' was
# corrected to 'stabilize' so that it can match a real lemma.
levinClassesModified = {
    'verbs in the unspecified object': [
        'bake', 'carve', 'chop', 'clean', 'cook', 'crochet',
        'draw', 'drink', 'dust', 'eat', 'embroider', 'hum', 'hunt', 'fish', 'iron', 'knead',
        'knit', 'mend', 'milk', 'mow', 'nurse', 'pack', 'paint', 'paint', 'play', 'plow',
        'polish', 'read', 'recite', 'sew', 'sculpt', 'sing', 'sketch', 'sow', 'study', 'sweep',
        'teach', 'type', 'sketch', 'vacuum', 'wash', 'weave', 'whittle', 'write'],
    'verbs in the understood body-part object': [
        'blink', 'clap', 'nod', 'paint', 'shrug',
        'squint', 'wag', 'wave', 'wink'],
    'change-of-state': [
        'age', 'bend', 'blur', 'break', 'burn', 'char', 'chill', 'chip',
        'collapse', 'condense', 'contract', 'corrode', 'crack', 'crash', 'crease', 'crinkle',
        'crumble', 'crush', 'decrease', 'deflate', 'defrost', 'degrade', 'diminish', 'dissolve',
        'distend', 'divide', 'double', 'drain', 'enlarge', 'expand', 'explode', 'fade', 'fill',
        'flood', 'fold', 'fracture', 'fray', 'freeze', 'fuse', 'grow', 'halt', 'heal', 'heat',
        'ignite', 'improve', 'increase', 'inflate', 'light', 'melt', 'multiply', 'pop',
        'reproduce', 'rip', 'rumble', 'rupture', 'scorch', 'shatter', 'shrink', 'shrivel',
        'single', 'sink', 'smash', 'snap', 'soak', 'splay', 'splinter', 'split', 'sprout',
        'steep', 'stretch', 'tear', 'thaw', 'tilt', 'topple', 'warp', 'wrinkle', 'open',
        'explode', 'flood', 'dissolve', 'crack', 'harden', 'boil', 'fracture', 'solidify',
        'collapse', 'cool', 'fold', 'widen', 'change', 'clear', 'divide', 'simmer', 'stabilize'],
    'object-drop verbs': [
        'play', 'paint', 'kick', 'carve', 'reap', 'wash', 'dance', 'yell',
        'type', 'knit', 'borrow', 'inherit', 'organize', 'rent', 'sketch', 'clean', 'pack',
        'study', 'swallow', 'call'],
    'verbs of disappearing': [
        'die', 'disappear', 'lapse', 'vanish', 'perish', 'expire',
        'blick', 'decease', 'depart', 'distinguish', 'fall_apart', 'lapse', 'pass_away', 'succumb'],
    'touch verbs': [
        'touch', 'pat', 'stroke', 'kiss', 'pinch', 'sting', 'peck', 'lick', 'nudge',
        'prod', 'caress', 'graze', 'tickle'],
    'manner-of-motion_alternate': [
        'bounce', 'dangle', 'drift', 'drop', 'float', 'fly', 'glide',
        'hang', 'lean', 'move', 'perch', 'rest', 'revolve', 'rock', 'roll', 'rotate', 'sit',
        'skid', 'slide', 'spin', 'stand', 'swing', 'turn', 'twist', 'whirl', 'wind', 'jump',
        'rush', 'march', 'leap', 'float', 'race', 'hurry', 'wander', 'vault', 'parade', 'gallop',
        'glide', 'hike', 'hop', 'jog', 'scoot', 'scurry', 'skip', 'tiptoe', 'trot'],
    'manner-of-motion_nonalternate': ['dance', 'swim', 'climb', 'jog', 'amble'],
    'verbs that emit light sound, substance': [
        'blaze', 'flame', 'flare', 'glare', 'gleam',
        'glisten', 'glitter', 'glow', 'shimmer', 'shine', 'sparkle', 'twinkle', 'blare', 'boom',
        'buzz', 'chatter', 'chime', 'creak', 'fizz', 'gurgle', 'hiss', 'howl', 'hum', 'peal',
        'purr', 'splutter', 'squawk', 'swoosh', 'thrum', 'vroom', 'whice', 'whump', 'zing', 'drip',
        'emanate', 'erupt', 'foam', 'gush', 'leak', 'ooze', 'puff', 'radiate', 'shed', 'spout', 'sweat'],
    'hit verbs': [
        'hit', 'beat', 'strike', 'pound', 'tap', 'knock', 'kick', 'butt', 'drum',
        'dash', 'hammer', 'lash', 'bang', 'bump', 'smash', 'smack', 'batter', 'thump', 'rap',
        'slap', 'tamp', 'thwack', 'whack', 'bash'],
    'verbs of eating': [
        'drink', 'eat', 'pick', 'peck', 'suck', 'lick', 'sip', 'chew', 'crunch',
        'nibble', 'chomp', 'gnaw', 'munch', 'slurp', 'swallow', 'bolt', 'wolf', 'swig', 'gulp',
        'guzzle', 'gobble', 'quaff', 'devour', 'consume', 'imbibe', 'swill', 'ingest', 'breakfast',
        'lunch', 'luncheon', 'picnic', 'banquet', 'snack', 'feast', 'dine', 'sup', 'graze',
        'brunch', 'nosh'],
    'verbs of laughing': [
        'smile', 'cry', 'laugh', 'beam', 'grin', 'weep', 'sigh', 'glare', 'cough',
        'chuckle', 'growl', 'howl', 'whistle', 'smirk', 'grimace', 'gasp', 'snort', 'yawn', 'sniff',
        'giggle', 'titter', 'moan', 'pout', 'frown', 'groan', 'guffaw', 'cackle', 'simper', 'jeer',
        'sob', 'snivel', 'goggle', 'gape', 'glower', 'snicker', 'sneeze', 'chortle', 'snore',
        'scowl', 'snigger', 'gawk', 'tam'],
    'verbs of falling': [
        'come', 'go', 'leave', 'return', 'fall', 'rise', 'enter', 'escape',
        'advance', 'cross', 'arrive', 'climb', 'depart', 'exit', 'plunge', 'descend', 'tumble',
        'recede', 'flee', 'ascend', 'meek'],
    'cut verbs': ['saw', 'cut', 'chip', 'scratch', 'clip', 'scrape', 'hack', 'slash', 'snip', 'hew'],
    'trans-intra extra': ['match', 'mix', 'cut', 'fit', 'help', 'hide', 'hug', 'keep', 'kiss', 'run','smell'],
    'causative extra': ['close', 'stop', 'shut', 'bang', 'beep', 'lose', 'miss', 'pass', 'squish'],
    'intransitive extra': ['listen', 'live', 'look', 'wait', 'crawl', 'step', 'walk', 'work', 'zoom'],
    'object-drop extra': [
        'forget','bite','bump','chug','color','grab','hold','knock','know',
        'peek','puff','pull','push','rake','remember','saw','scratch','see','slip','spill','spray',
        'think','touch','try','watch'],
}
# Stemmed word tokens of the child (CHI) and of the mother (MOT), taken from
# the same slice of the corpus file list.
# NOTE(review): [490:629] presumably selects specific sub-corpora; the indices
# depend on the installed corpus -- confirm.
sample_fileids = childes.fileids()[490:629]
child_words = childes.words(sample_fileids, speaker=["CHI"], stem=True, strip_space=False)
adult_words = childes.words(sample_fileids, speaker=["MOT"], stem=True, strip_space=False)
# Frequency of every stem, per speaker group.
child_fd = nltk.FreqDist(child_words)
adult_fd = nltk.FreqDist(adult_words)
print '\t%20s %6s %6s %6s %6s' % ("","CHILD","ADULT","CHILD","ADULT")
print '\t%20s %6d %6d %6d %6d' % ("Total (all verbs)",child_fd.N(),adult_fd.N(),100,100)
for thisClass in levinClassesModified.keys():
print thisClass
print '\t%20s %6s %6s %6s %6s' % ("","CHILD","ADULT","CHILD","ADULT")
for thisVerb in levinClassesModified[thisClass]:
try:
childProp = float((child_fd[thisVerb])/float(child_fd.N()))*100
except ZeroDivisionError as e:
childProp = 0
try:
adultProp = float((adult_fd[thisVerb])/float(adult_fd.N()))*100
except ZeroDivisionError as e:
adultProp = 0
print '\t%20s %6d %6d %.3f%% %.3f%%' % (thisVerb,child_fd[thisVerb],adult_fd[thisVerb],childProp,adultProp)
CHILD ADULT CHILD ADULT
Total (all verbs) 151906 166344 100 100
verbs in the unspecified object
CHILD ADULT CHILD ADULT
bake 5 2 0.003% 0.001%
carve 0 0 0.000% 0.000%
chop 2 0 0.001% 0.000%
clean 31 26 0.020% 0.016%
cook 18 28 0.012% 0.017%
crochet 0 0 0.000% 0.000%
draw 76 44 0.050% 0.026%
drink 93 113 0.061% 0.068%
dust 4 5 0.003% 0.003%
eat 199 251 0.131% 0.151%
embroider 0 0 0.000% 0.000%
hum 1 1 0.001% 0.001%
hunt 0 0 0.000% 0.000%
fish 76 56 0.050% 0.034%
iron 2 7 0.001% 0.004%
knead 0 0 0.000% 0.000%
knit 0 0 0.000% 0.000%
mend 0 0 0.000% 0.000%
milk 155 145 0.102% 0.087%
mow 0 0 0.000% 0.000%
nurse 11 5 0.007% 0.003%
pack 0 13 0.000% 0.008%
paint 6 10 0.004% 0.006%
paint 6 10 0.004% 0.006%
play 214 250 0.141% 0.150%
plow 0 0 0.000% 0.000%
polish 5 6 0.003% 0.004%
read 5 0 0.003% 0.000%
recite 0 0 0.000% 0.000%
sew 2 1 0.001% 0.001%
sculpt 0 0 0.000% 0.000%
sing 60 125 0.039% 0.075%
sketch 0 0 0.000% 0.000%
sow 0 0 0.000% 0.000%
study 17 13 0.011% 0.008%
sweep 6 6 0.004% 0.004%
teach 10 3 0.007% 0.002%
type 0 0 0.000% 0.000%
sketch 0 0 0.000% 0.000%
vacuum 2 11 0.001% 0.007%
wash 58 43 0.038% 0.026%
weave 0 0 0.000% 0.000%
whittle 0 0 0.000% 0.000%
write 404 129 0.266% 0.078%
Example 4 (Checking word distributions of adult and child utterances)
#---------- usingCHILDESWithNLTK4.py ----------#
# Description: Processing CHILDES XML data
# Date: February 6, 2011
# Written by Tomonori Nagano (tnagano@gc.cuny.edu) for Python2.5
import sys
sys.path.append('/Library/Python/2.6/site-packages/')
import nltk
from nltk.probability import ConditionalFreqDist, FreqDist
from nltk.corpus.reader import CHILDESCorpusReader
import random
# Reader over every XML transcript below the Eng-USA CHILDES directory.
# The data directory is spelled in lower case ('corpora/childes/'), as in the
# other examples on this page; the upper-case spelling only resolves on
# case-insensitive file systems.
corpus_root = nltk.data.find('corpora/childes/data-xml/Eng-USA/')
childes = CHILDESCorpusReader(corpus_root,u'.*.xml')
# Token threshold per age band and speaker group: make_cfd() stops adding
# transcripts to a band once it holds more than this many tokens.
sampleNum = 3000
# Word frequencies per age-band label, for child and for adult speech.
child_cfd = ConditionalFreqDist()
adult_cfd = ConditionalFreqDist()
# Labels for consecutive 12-month age bands; ages[0] is used for children
# younger than 12 months, ages[1] for 12-23 months, and so on.
ages = ['1yld','2yld','3yld','4yld','5yld','6yld','7yld']
def make_cfd(ageKey, fileid=None):
    """Add one transcript's words to the child and adult counts of an age band.

    ageKey -- index into the module-level ``ages`` list selecting the band.
    fileid -- transcript to read; when omitted, the module-level ``filename``
              set by the sampling loop is used (the original behaviour).

    For each speaker group (child "CHI", mother "MOT") the transcript is
    skipped once that band already holds more than ``sampleNum`` tokens.
    A transcript is always added in full, so a band can end up with more
    than ``sampleNum`` tokens.
    """
    if fileid is None:
        # fall back to the loop variable of the calling script
        fileid = filename
    label = ages[ageKey]
    # Same procedure for both speaker groups, child first.
    for speaker_cfd, who in ((child_cfd, "CHI"), (adult_cfd, "MOT")):
        if speaker_cfd[label].N() > sampleNum:
            continue
        # speaker is passed as a list, as in the other examples on this page
        for word in childes.words(fileid, speaker=[who]):
            speaker_cfd[label].inc(word)
# Visit a random sample of transcripts and add each one to the 12-month age
# band of its child.
# for filename in random.sample(childes.fileids(),100*7):
all_fileids = childes.fileids()
# random.sample() raises ValueError when asked for more items than exist, so
# cap the sample size at the number of available transcripts.
# The loop variable must stay named "filename": make_cfd() reads it as a global.
for filename in random.sample(all_fileids, min(500, len(all_fileids))):
    age = childes.age(filename,month=True)[0]
    if age is None:
        # no usable age recorded for this transcript
        continue
    # Band 0 is everything under 12 months, band 1 is 12-23 months, ...
    band = max(0, int(age) // 12)
    if band < len(ages):
        # 84 months and older are outside the tabulated bands
        make_cfd(band)
# Write one word-frequency list per speaker group and age band
# ("child1yld.wfl" ... "adult7yld.wfl"): a header row, then one
# "<word>\t<frequency>" row per word type. All child files are written first.
# NOTE(review): rows are terminated with "\r" only, as in the original;
# confirm the downstream reader accepts that line ending.
for prefix, speaker_cfd in (("child", child_cfd), ("adult", adult_cfd)):
    for age in ages:
        filename = prefix+age+".wfl"
        outfile = open(filename,"w")
        try:
            outfile.write("type\tf\r")
            for word in speaker_cfd[age].keys():
                outfile.write(unicode(word).encode("utf-8")+"\t"+str(speaker_cfd[age][word])+"\r")
        finally:
            # close the file even if encoding or writing a row fails
            outfile.close()
################################################
## R procedures
################################################
## clear the cache
#rm(list = ls())
#
## change the default width
#width.default <- getOption("width"); options(width=90)
#
#library(zipfR)
#setwd("~/Desktop/")
#
#colors = topo.colors(10, alpha = 0.5)
#speakers = c("adult","child")
#ages = c('1yld','2yld','3yld','4yld','5yld','6yld','7yld')
#for (j in 1:length(speakers)){
# for (i in 1:length(ages)){
# pdf(paste(speakers[j],"_spc_",ages[i],".pdf",sep=""), width = 8, height = 6, onefile = TRUE, pointsize = 9)
# filename <- paste(speakers[j],ages[i],".wfl",sep="")
# this_spc <- tfl2spc(read.tfl(filename))
# this_lnre <- lnre("zm",this_spc)
# this_text <- paste("LNRE Model\n--------------------\n","Method: ",this_lnre$name,"\n","alpha=",format(this_lnre$param$alpha,digits=3)," B=",format(this_lnre$param$B,digits=3),"\n","Goodness-of-fit\n","X^2(df=",this_lnre$gof$df,")=",format(this_lnre$gof$X2,digits=1),", p=",format(this_lnre$gof$p,digits=1),sep="")
# plot(this_spc,main=paste("Frequency spectrum of ",speakers[j]," utterances in CHILDES\n(",ages[i],"; n=3000; randomly sampled)\nPrinted on: ",format(Sys.time(), "%b %d, %Y"),sep=""),xlab="Frequency Class, m (m=1 is hapax legomenon)",ylab="Frequency of Frequency Class, V_m",barcol=colors[i])
# mtext(this_text,side=3,adj=1,padj=2)
# dev.off()
#
# pdf(paste(speakers[j],"_vgc_",ages[i],".pdf",sep=""), width = 8, height = 6, onefile = TRUE, pointsize = 9)
# this_vgc <- vgc.interp(this_spc,(1:100)*N(this_spc)/100)
# plot(this_vgc,main=paste("Vocabulary growth of ",speakers[j]," utterances in CHILDES\n(",ages[i],"; n=3000; randomly sampled)\nPrinted on: ",format(Sys.time(), "%b %d, %Y"),sep=""),xlab="Number of words, N",ylab="Binomially interpolated vocabulary growth curve, E[V(N)]",col=colors[i])
# dev.off()
# }
#}
#
Comments and feedback