Analyzing CHILDES with NLTK

Setting up

Updates

Testing your setup


name@home:~/$ python
Python 2.6.1 (r261:67515, Jun 24 2010, 21:47:49) 
[GCC 4.2.1 (Apple Inc. build 5646)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> import nltk
>>> nltk.corpus.reader.childes.demo()

Reading bates amy20  .....
words: ["what's", 'that', 'yyy', "it's", 'a', 'chicken', 'yeah'] ...
words with replaced words: ["what's", 'that', 'yyy', "it's", 'a', 'chicken', 'yeah']  ...
words with pos tags: [("what's", 'pro:wh'), ('that', 'pro:dem'), ('yyy', 'unk'), ("it's", 'pro:cop'), ('a', 'det'), ('chicken', 'n'), ('yeah', 'co')]  ...
words (only MOT): ["what's", 'that', "it's", 'a', 'chicken', 'yeah', "what's"] ...
words (only CHI): ['yyy', 'yeah', 'yyy', 'yeah', 'xxx', 'woof', 'hat'] ...
stemmed words: ['what', 'be', 'that', 'yyy', 'it', 'be', 'a']  ...
words with relations and pos-tag: [[('what', 'pro:wh', '1|2|PRED'), ('be', 'v', '2|0|ROOT'), ('that', 'pro:dem', '3|2|SUBJ')], [('yyy', 'unk', '1|0|ROOT')], [('it', 'pro:cop', '1|2|SUBJ'), ('be', 'v', '2|0|ROOT'), ('a', 'det', '3|4|DET'), ('chicken', 'n', '4|2|PRED')], [('yeah', 'co', '1|0|ROOT')], [('yeah', 'co', '1|0|ROOT')]]  ...
sentence: [["what's", 'that'], ['yyy']]  ...
	participant CHI group : normal
	participant CHI language : eng
	participant CHI age : P1Y8M
	participant CHI sex : female
	participant CHI role : Child
	participant CHI id : CHI
	participant MOT role : Mother
	participant MOT id : MOT
	participant MOT language : eng
num of sent: 170
num of morphemes: 296
age: ['P1Y8M']
age in month: [20]
MLU: [1.1428571428571428]

Reading bates betty20  .....
words: ['want', 'to', 'stack', 'them', 'for', 'me', 'yyy'] ...
words with replaced words: ['want', 'to', 'stack', 'them', 'for', 'me', 'yyy']  ...
words with pos tags: [('want', 'v'), ('to', 'inf'), ('stack', 'v'), ('them', 'pro'), ('for', 'prep'), ('me', 'pro'), ('yyy', 'unk')]  ...
words (only MOT): ['want', 'to', 'stack', 'them', 'for', 'me', 'oh'] ...
words (only CHI): ['yyy', 'that', 'baby', 'yyy', 'yyy', 'uh', 'xxx'] ...
stemmed words: ['want', 'to', 'stack', 'them', 'for', 'me', 'yyy']  ...
words with relations and pos-tag: [[('want', 'v', '1|0|ROOT'), ('to', 'inf', '2|3|INF'), ('stack', 'v', '3|1|XCOMP'), ('them', 'pro', '4|3|OBJ'), ('for', 'prep', '5|3|JCT'), ('me', 'pro', '6|5|POBJ')], [], [], [('yyy', 'unk', '1|0|ROOT')], []]  ...
sentence: [['want', 'to', 'stack', 'them', 'for', 'me'], []]  ...
	participant CHI group : normal
	participant CHI name : Betty
	participant CHI language : eng
	participant CHI age : P1Y8M
	participant CHI sex : female
	participant CHI role : Target_Child
	participant CHI id : CHI
	participant MOT role : Mother
	participant MOT id : MOT
	participant MOT language : eng
num of sent: 148
num of morphemes: 406
age: ['P1Y8M']
age in month: [20]
MLU: [1.1428571428571428]
...

Basic usage

  • name@home:~/$ python
    Python 2.6.1 (r261:67515, Jun 24 2010, 21:47:49) 
    [GCC 4.2.1 (Apple Inc. build 5646)] on darwin
    Type "help", "copyright", "credits" or "license" for more information.
    >>> import nltk
    >>> from nltk.corpus.reader import CHILDESCorpusReader
    >>> corpus_root = nltk.data.find('corpora/childes/data-xml/Eng-USA/')
    
    >>> valian = CHILDESCorpusReader(corpus_root, u'Valian/.*.xml')
    >>> valian.fileids()[1:10]
        ['Valian/01b.xml', 'Valian/02a.xml', 'Valian/02b.xml', 'Valian/03a.xml', 'Valian/03b.xml', 'Valian/04a.xml'...
    
    >>> len(valian.fileids())
        43
    
    >>> corpus_data = valian.corpus(valian.fileids())
    >>> print corpus_data[0]
        {'Lang': 'eng', '{http://www.w3.org/2001/XMLSchema-instance}schemaLocation': ...
    
    >>> for key in corpus_data[0].keys():
        ...    print key, ": ", corpus_data[0][key]
        Lang :  eng
        {http://www.w3.org/2001/XMLSchema-instance}schemaLocation :  ...
        Version :  1.5.6
        Date :  1986-03-04
        Corpus :  valian
        Id :  01a
    
    >>> corpus_participants = valian.participants(valian.fileids())
    >>> for this_corpus_participants in corpus_participants[:2]:
        ...     for key in this_corpus_participants.keys():
        ...        print key, ": ", this_corpus_participants[key]
        CHI :  defaultdict(, {'group': 'normal'...
        INV :  defaultdict(, {'role': 'Investigator'...
        MOT :  defaultdict(, {'role': 'Mother',...
    
    >>> valian.words('Valian/01a.xml')
        ['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', ...
    >>> valian.words(valian.fileids()[0])
        ['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', ...
    # this will process all files in the Valian corpus
    >>> valian.words(valian.fileids()[:])
        ['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', ...
    
    >>> valian.sents('Valian/01a.xml')
        [['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', 'and', 'it', 'is', 'March', 
        'fourth', 'I', 'believe', 'and', 'when', 'was', "Parent's", 'birthday'], ["Child's"], 
        ['oh', "I'm", 'sorry'], ["that's", 'okay'], ...
    
    >>> valian.words('Valian/01a.xml',speaker=['INV'])
        ['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', ...
    >>> valian.words('Valian/01a.xml',speaker=['MOT'])
        ["Child's", "that's", 'okay', 'February', 'first', 'nineteen', ...
    >>> valian.words('Valian/01a.xml',speaker=['CHI'])
        ['tape', 'it', 'up', 'and', 'two', 'tape', 'players', 'have',...
    
    >>> valian.tagged_words('Valian/01a.xml')
        [('at', 'prep'), ('Parent', 'n:prop'), ("Lastname's", 'n:prop'), ('house', 'n'), ...
    
    >>> valian.tagged_words('Valian/01a.xml')
        [('at', 'prep'), ('Parent', 'n:prop'), ("Lastname's", 'n:prop'), ('house', 'n'), ...
    
    >>> valian.tagged_sents('Valian/01a.xml')
        [[('at', 'prep'), ('Parent', 'n:prop'), ("Lastname's", 'n:prop'), ...
    
    >>> valian.words('Valian/01a.xml',speaker=['CHI'])[247]
        'tikteat'
    >>> valian.words('Valian/01a.xml',speaker=['CHI'],replace=True)[247]
        'trick'
    
    >>> valian.words('Valian/01a.xml',relation=True) 
        [[('at', 'prep', '1|9|COORD'), ('Parent', 'n:prop', '2|5|NAME'), ('Lastname', 'n:prop', '3|5|MOD'),
        ('s', 'poss', '4|5|MOD'), ('house', 'n', '5|1|POBJ'), ('with', 'prep', '6|9|COORD'), 
        ('Child', 'n:prop', '7|9|COORD'), ('Lastname', 'n:prop', '8|9|COORD'), ('and', 'conj:coo', '9|16|COORD'),
        ('it', 'pro', '10|11|SUBJ'), ('be-3S', 'v:cop', '11|9|COORD'), ('March', 'n:prop', '12|11|PRED'), 
        ('fourth', 'adj', '13|16|COORD'), ('I', 'pro', '14|15|SUBJ'), ('believe', 'v', '15|16|COORD'), 
        ('and', 'conj:coo', '16|0|ROOT'), ('when', 'adv:wh', '17|18|PRED'), ('be-PAST', 'v:cop', '18|16|COORD'),
        ('Parent', 'n:prop', '19|21|MOD'), ('s', 'poss', '20|21|MOD'), ('birthday', 'n', '21|18|SUBJ')], 
        [('Child', 'n:prop', '1|2|MOD'), ('s', 'poss', '2|0|ROOT')], [('oh', 'co', '1|3|COM'), 
        ('I', 'pro:cop', '2|3|SUBJ'), ('be', 'v', '3|0|ROOT'), ('sorry', 'adj', '4|3|PRED')], 
        [('that', 'pro:dem', '1|2|SUBJ'), ('be', 'v', '2|0|ROOT'), ('okay', 'adj', '3|2|PRED')], 
        [('February', 'n:prop', '1|5|VOC'), ('first', 'adj', '2|5|ENUM'), ('nineteen', 'det:num', '3|5|ENUM'), 
        ('eighty', 'det:num', '4|5|ENUM'), ('four', 'det:num', '5|0|ROOT')], [('great', 'adj', '1|0|ROOT')], 
        [('and', 'conj:coo', '1|0|ROOT'), ('she', 'pro:cop', '2|3|SUBJ'), ('be', 'v', '3|1|COORD'), 
        ('two', 'det:num', '4|5|QUANT'), ('year-PL', 'n', '5|3|PRED'), ('old', 'adj', '6|3|PRED')], 
        [('correct', 'adj', '1|0|ROOT')], [('okay', 'co', '1|0|ROOT')], [('she', 'pro', '1|3|SUBJ'), 
        ('just', 'adv:int', '2|3|JCT'), ('turn-PERF', 'part', '3|0|ROOT'), ('two', 'det:num', '4|6|QUANT'), 
        ('a', 'det', '5|6|DET'), ('month', 'n', '6|3|OBJ'), ('ago', 'adv', '7|3|JCT')]]
    
    >>> valian.age()
        ['P2Y1M3D', 'P2Y1M12D', 'P1Y9M21D', 'P1Y9M28D', 'P2Y1M23D', ...
    >>> valian.age('Valian/01a.xml')
        ['P2Y1M3D']
    >>> valian.age('Valian/01a.xml',month=True)
        [25]
    
    >>> valian.MLU()
        [1.8798283261802575, 1.9375, 2.6983240223463687, 2.3945945945945946, ...
    >>> valian.MLU('Valian/01a.xml')
        [1.8798283261802575]
    

    Example 1 (Age and MLU)

    #---------- usingCHILDESWithNLTK1.py ----------#
    # Description: Processing CHILDES XML data
    # Date: January 17, 2011
    # Written by Tomonori Nagano (tnagano@gc.cuny.edu) for Python2.5
    
    import nltk
    from nltk.corpus.reader import CHILDESCorpusReader
    
    # Build a reader over every XML transcript in the Eng-USA CHILDES data
    # installed under the NLTK data directory.
    corpus_root = nltk.data.find('corpora/childes/data-xml/Eng-USA/')
    childes = CHILDESCorpusReader(corpus_root,u'.*.xml')
    
    # Write one tab-separated row per transcript: filename, age in months, MLU.
    outfile = open('usingCHILDESWithNLTK1Data.txt', 'w')		# open output file
    print >> outfile, "\t".join(["filename","age","MLU"])
    # run the entire CHILDES corpus
    for filename in childes.fileids():
    	age = childes.age(filename,month=True)[0]
    	MLU = childes.MLU(filename)[0]
    	print >> outfile, "\t".join([filename, str(age), str(MLU)])
    outfile.close()		# flush and release the handle (was never closed)
    
    ####################
    # Plotting with R
    ####################
    #thisData <- read.table("~/Desktop/data1.txt",sep="\t",fill=TRUE,header=TRUE)
    #thisData = subset(thisData,age!="None"&age!=0&MLU!=0)
    #thisData$age = as.numeric(thisData$age)
    #plot(thisData$age,thisData$MLU,ylim=c(0,10),xlab="Age (in Month)",ylab="MLU",main="Age and MLU\nAll CHILDES corpora")
    #abline(lm(thisData$MLU~thisData$age-1),col="red")
    
    filename	age	MLU
    Bates/Free20/amy20.xml	20	1.14285714286
    Bates/Free20/betty20.xml	20	1.14285714286
    Bates/Free20/chuck20.xml	20	1.85714285714
    Bates/Free20/doug20.xml	20	1.4
    Bates/Free20/ed20.xml	20	1.33333333333
    Bates/Free20/frank20.xml	32	1.76923076923
    Bates/Free20/gloria20.xml	22	1.25
    Bates/Free20/hank20.xml	20	1.5
    Bates/Free20/ivy20.xml	20	4.0
    Bates/Free20/jane20.xml	20	1.0
    Bates/Free20/jim20.xml	20	1.41666666667
    Bates/Free20/kathy20.xml	20	1.5
    Bates/Free20/keith20.xml	20	2.34375
    Bates/Free20/kent20.xml	20	1.0
    Bates/Free20/linda20.xml	20	1.55555555556
    Bates/Free20/mandy20.xml	20	1.58333333333
    Bates/Free20/nan20.xml	20	1.0625
    Bates/Free20/olivia20.xml	20	1.78787878788
    Bates/Free20/paula20.xml	20	1.25
    Bates/Free20/pete20.xml	20	1.0
    ...
    

    Example 2 (Counting modal use by age)

    #---------- usingCHILDESWithNLTK2.py ----------#
    # Description: Processing CHILDES XML data
    # Date: January 17, 2011
    # Written by Tomonori Nagano (tnagano@gc.cuny.edu) for Python2.5
    
    import nltk
    from nltk.probability import ConditionalFreqDist, FreqDist
    from nltk.corpus.reader import CHILDESCorpusReader
    
    modals = ["can","can't","could","couldn't","'ld","will","'11","won't","would","wouldn't","shall","should","shouldn't","may","might","must","ought"]
    semimodal = ["gonna","wanna","hafta","gotta"]
    
    corpus_root = nltk.data.find('corpora/childes/data-xml/Eng-USA/')
    childes = CHILDESCorpusReader(corpus_root,u'.*.xml')
    
    cfd = ConditionalFreqDist()
    
    ages = ['1yld','2yld','3yld','4yld','5yld','6yld','7yld']
    # change [:100] to [:] to run the entire CHILDES corpus
    for filename in childes.fileids()[:]:
    	age = childes.age(filename,month=True)[0]
    	MLU = childes.MLU(filename)[0]
    	words = childes.words(filename,speaker=["CHI"])
    	if age is None:
    		pass
    	elif age < 12:
    		for word in words: cfd[ages[0]].inc(word)
    	elif age >= 12 and age < 24 :
    		for word in words: cfd[ages[1]].inc(word)
    	elif age >= 24 and age < 36 :
    		for word in words: cfd[ages[2]].inc(word)
    	elif age >= 36 and age < 48 :
    		for word in words: cfd[ages[3]].inc(word)
    	elif age >= 48 and age < 60 :
    		for word in words: cfd[ages[4]].inc(word)
    	elif age >= 60 and age < 72 :
    		for word in words: cfd[ages[5]].inc(word)
    	elif age >= 72 and age < 84 :
    		for word in words: cfd[ages[6]].inc(word)
    
    print "age\tallTokens\t","\t".join(modals)
    for thisAge in ages:
    	print thisAge,"\t",cfd[thisAge].N(),"\t",
    	for modal in modals:
    		print cfd[thisAge][modal],"\t",
    	print ""
    
    print "age\tallTokens\t","\t".join(modals)
    for thisAge in ages:
    	print thisAge,"\t",cfd[thisAge].N(),"\t",
    	for modal in modals:
    		try:
    			thisProp = float(cfd[thisAge][modal])/float(cfd[thisAge].N())*100
    		except:
    			thisProp = 0
    		print "%.2f%%\t" % thisProp,
    	print ""
    
    age  allTokens  can can't  could couldn't  'ld  will  '11  won't  would wouldn't shall should shouldn't  may might must ought
    2yld    102494   67    47      0        0    0    15    0      3      2        0     0      0         0    6     2    2     0   
    3yld    525729 1570  1183    190       12    0   697    0    170    138       10    29     61         2   63    52   39     1   
    4yld    505159 2883  1229   1072       85    0  1265    0    368    566       46    23    175        25  116    96   52     2   
    5yld    332560 2279   793    640      106    0   469    0    228    406       82     5    147        15   42    87   28     0   
    6yld    151908  937   316    251       35    0   190    0    101    194       36     2     73         7   31    54   19     1   
    7yld     36053  165    75     66       15    0    30    0     25     46        9     1     21         1   14    16    4     1   
    
    age  allTokens   can    can't  could  couldn't   'ld   will    '11  won't  would  wouldn't  shall  should shouldn't  may    might  must  ought
    2yld    102494   0.07%  0.05%  0.00%  0.00%    0.00%  0.01%  0.00%  0.00%  0.00%     0.00%  0.00%  0.00%      0.00%  0.01%  0.00%  0.00%  0.00%  
    3yld    525729   0.30%  0.23%  0.04%  0.00%    0.00%  0.13%  0.00%  0.03%  0.03%     0.00%  0.01%  0.01%      0.00%  0.01%  0.01%  0.01%  0.00%  
    4yld    505159   0.57%  0.24%  0.21%  0.02%    0.00%  0.25%  0.00%  0.07%  0.11%     0.01%  0.00%  0.03%      0.00%  0.02%  0.02%  0.01%  0.00%  
    5yld    332560   0.69%  0.24%  0.19%  0.03%    0.00%  0.14%  0.00%  0.07%  0.12%     0.02%  0.00%  0.04%      0.00%  0.01%  0.03%  0.01%  0.00%  
    6yld    151908   0.62%  0.21%  0.17%  0.02%    0.00%  0.13%  0.00%  0.07%  0.13%     0.02%  0.00%  0.05%      0.00%  0.02%  0.04%  0.01%  0.00%  
    7yld     36053   0.46%  0.21%  0.18%  0.04%    0.00%  0.08%  0.00%  0.07%  0.13%     0.02%  0.00%  0.06%      0.00%  0.04%  0.04%  0.01%  0.00%  
    

    Example 3 (Counting Levin's verb class)

    #---------- usingCHILDESWithNLTK3.py ----------#
    # Description: Processing CHILDES XML data
    # Date: January 17, 2011
    # Written by Tomonori Nagano (tnagano@gc.cuny.edu) for Python2.5
    
    import nltk
    from nltk.probability import ConditionalFreqDist, FreqDist
    from nltk.corpus.reader import CHILDESCorpusReader
    
    # Build a reader over every XML transcript in the Eng-USA CHILDES data.
    corpus_root = nltk.data.find('corpora/CHILDES/data-xml/Eng-USA/')
    childes = CHILDESCorpusReader(corpus_root,u'.*.xml')
    
    # based on Levin (1993) and Pinker (1989; 2007)
    # need some work on manner-of-motion_nonalternate
    # Maps a Levin/Pinker verb-class label to the list of (stemmed) verbs in
    # that class. The report below iterates over each list in order.
    # NOTE(review): a few lists contain duplicates ('paint' and 'sketch' both
    # appear twice in 'verbs in the unspecified object'), so those verbs are
    # printed twice in the output; 'stablize' looks like a typo for
    # 'stabilize' -- confirm against the original verb lists before changing.
    levinClassesModified = {
    	'verbs in the unspecified object':  ['bake', 'carve', 'chop', 'clean', 'cook', 'crochet', \
    		'draw', 'drink', 'dust', 'eat', 'embroider', 'hum', 'hunt', 'fish', 'iron', 'knead', \
    		'knit', 'mend', 'milk', 'mow', 'nurse', 'pack', 'paint', 'paint', 'play', 'plow', \
    		'polish', 'read', 'recite', 'sew', 'sculpt', 'sing', 'sketch', 'sow', 'study', 'sweep', \
    		'teach', 'type', 'sketch', 'vacuum', 'wash', 'weave', 'whittle', 'write'],
    	'verbs in the understood body-part object':  ['blink', 'clap', 'nod', 'paint', 'shrug', \
    		'squint', 'wag', 'wave', 'wink'],
    	'change-of-state':  ['age', 'bend', 'blur', 'break', 'burn', 'char', 'chill', 'chip', \
    		'collapse', 'condense', 'contract', 'corrode', 'crack', 'crash', 'crease', 'crinkle', \
    		'crumble', 'crush', 'decrease', 'deflate', 'defrost', 'degrade', 'diminish', 'dissolve', \
    		'distend', 'divide', 'double', 'drain', 'enlarge', 'expand', 'explode', 'fade', 'fill', \
    		'flood', 'fold', 'fracture', 'fray', 'freeze', 'fuse', 'grow', 'halt', 'heal', 'heat', \
    		'ignite', 'improve', 'increase', 'inflate', 'light', 'melt', 'multiply', 'pop', \
    		'reproduce', 'rip', 'rumble', 'rupture', 'scorch', 'shatter', 'shrink', 'shrivel', \
    		'single', 'sink', 'smash', 'snap', 'soak', 'splay', 'splinter', 'split', 'sprout', \
    		'steep', 'stretch', 'tear', 'thaw', 'tilt', 'topple', 'warp', 'wrinkle', 'open', \
    		'explode', 'flood', 'dissolve', 'crack', 'harden', 'boil', 'fracture', 'solidify', \
    		'collapse', 'cool', 'fold', 'widen', 'change', 'clear', 'divide', 'simmer', 'stablize'],
    	'object-drop verbs':  ['play', 'paint', 'kick', 'carve', 'reap', 'wash', 'dance', 'yell', \
    		'type', 'knit', 'borrow', 'inherit', 'organize', 'rent', 'sketch', 'clean', 'pack', \
    		'study', 'swallow', 'call'],
    	'verbs of disappearing':  ['die', 'disappear', 'lapse', 'vanish', 'perish', 'expire', \
    		'blick', 'decease', 'depart', 'distinguish', 'fall_apart', 'lapse', 'pass_away', 'succumb'],
    	'touch verbs':  ['touch', 'pat', 'stroke', 'kiss', 'pinch', 'sting', 'peck', 'lick', 'nudge', \
    		'prod', 'caress', 'graze', 'tickle'],
    	'manner-of-motion_alternate':  ['bounce', 'dangle', 'drift', 'drop', 'float', 'fly', 'glide', \
    		'hang', 'lean', 'move', 'perch', 'rest', 'revolve', 'rock', 'roll', 'rotate', 'sit', \
    		'skid', 'slide', 'spin', 'stand', 'swing', 'turn', 'twist', 'whirl', 'wind', 'jump', \
    		'rush', 'march', 'leap', 'float', 'race', 'hurry', 'wander', 'vault', 'parade', 'gallop', \
    		'glide', 'hike', 'hop', 'jog', 'scoot', 'scurry', 'skip', 'tiptoe', 'trot'],
    	'manner-of-motion_nonalternate':  ['dance', 'swim', 'climb', 'jog', 'amble'],
    	'verbs that emit light sound, substance':  ['blaze', 'flame', 'flare', 'glare', 'gleam', \
    		'glisten', 'glitter', 'glow', 'shimmer', 'shine', 'sparkle', 'twinkle', 'blare', 'boom', \
    		'buzz', 'chatter', 'chime', 'creak', 'fizz', 'gurgle', 'hiss', 'howl', 'hum', 'peal', \
    		'purr', 'splutter', 'squawk', 'swoosh', 'thrum', 'vroom', 'whice', 'whump', 'zing', 'drip', \
    		'emanate', 'erupt', 'foam', 'gush', 'leak', 'ooze', 'puff', 'radiate', 'shed', 'spout', 'sweat'],
    	'hit verbs':  ['hit', 'beat', 'strike', 'pound', 'tap', 'knock', 'kick', 'butt', 'drum', \
    		'dash', 'hammer', 'lash', 'bang', 'bump', 'smash', 'smack', 'batter', 'thump', 'rap', \
    		'slap', 'tamp', 'thwack', 'whack', 'bash'],
    	'verbs of eating':  ['drink', 'eat', 'pick', 'peck', 'suck', 'lick', 'sip', 'chew', 'crunch', \
    		'nibble', 'chomp', 'gnaw', 'munch', 'slurp', 'swallow', 'bolt', 'wolf', 'swig', 'gulp', \
    		'guzzle', 'gobble', 'quaff', 'devour', 'consume', 'imbibe', 'swill', 'ingest', 'breakfast', \
    		'lunch', 'luncheon', 'picnic', 'banquet', 'snack', 'feast', 'dine', 'sup', 'graze', \
    		'brunch', 'nosh'],
    	'verbs of laughing':  ['smile', 'cry', 'laugh', 'beam', 'grin', 'weep', 'sigh', 'glare', 'cough', \
    		'chuckle', 'growl', 'howl', 'whistle', 'smirk', 'grimace', 'gasp', 'snort', 'yawn', 'sniff', \
    		'giggle', 'titter', 'moan', 'pout', 'frown', 'groan', 'guffaw', 'cackle', 'simper', 'jeer', \
    		'sob', 'snivel', 'goggle', 'gape', 'glower', 'snicker', 'sneeze', 'chortle', 'snore', \
    		'scowl', 'snigger', 'gawk', 'tam'],
    	'verbs of falling':  ['come', 'go', 'leave', 'return', 'fall', 'rise', 'enter', 'escape', \
    		'advance', 'cross', 'arrive', 'climb', 'depart', 'exit', 'plunge', 'descend', 'tumble', \
    		'recede', 'flee', 'ascend', 'meek'],
    	'cut verbs':  ['saw', 'cut', 'chip', 'scratch', 'clip', 'scrape', 'hack', 'slash', 'snip', 'hew'],
    	'trans-intra extra':  ['match', 'mix', 'cut', 'fit', 'help', 'hide', 'hug', 'keep', 'kiss', 'run','smell'],
    	'causative extra':  ['close', 'stop', 'shut', 'bang', 'beep', 'lose', 'miss', 'pass', 'squish'],
    	'intransitive extra':  ['listen', 'live', 'look', 'wait', 'crawl', 'step', 'walk', 'work', 'zoom'],
    	'object-drop extra':  ['forget','bite','bump','chug','color','grab','hold','knock','know', \
    		'peek','puff','pull','push','rake','remember','saw','scratch','see','slip','spill','spray',\
    		'think','touch','try','watch']
    }
    
    child_words = childes.words(childes.fileids()[490:629], speaker=["CHI"], stem=True, strip_space=False)
    adult_words = childes.words(childes.fileids()[490:629], speaker=["MOT"], stem=True, strip_space=False)
    
    child_fd = nltk.FreqDist(child_words)
    adult_fd = nltk.FreqDist(adult_words)
    print '\t%20s %6s %6s %6s   %6s' % ("","CHILD","ADULT","CHILD","ADULT")
    print '\t%20s %6d %6d %6d   %6d' % ("Total (all verbs)",child_fd.N(),adult_fd.N(),100,100)
    for thisClass in levinClassesModified.keys():
    	print thisClass
    	print '\t%20s %6s %6s   %6s   %6s' % ("","CHILD","ADULT","CHILD","ADULT")
    	for thisVerb in levinClassesModified[thisClass]:
    		try:
    			childProp = float((child_fd[thisVerb])/float(child_fd.N()))*100
    		except ZeroDivisionError as e:
    			childProp = 0
    		try:
    			adultProp = float((adult_fd[thisVerb])/float(adult_fd.N()))*100
    		except ZeroDivisionError as e:
    			adultProp = 0
    		print '\t%20s %6d %6d   %.3f%%   %.3f%%' % (thisVerb,child_fd[thisVerb],adult_fd[thisVerb],childProp,adultProp)
    
    	                      CHILD  ADULT  CHILD    ADULT
    	   Total (all verbs) 151906 166344    100      100
    verbs in the unspecified object
    	                      CHILD  ADULT    CHILD    ADULT
    	                bake      5      2   0.003%   0.001%
    	               carve      0      0   0.000%   0.000%
    	                chop      2      0   0.001%   0.000%
    	               clean     31     26   0.020%   0.016%
    	                cook     18     28   0.012%   0.017%
    	             crochet      0      0   0.000%   0.000%
    	                draw     76     44   0.050%   0.026%
    	               drink     93    113   0.061%   0.068%
    	                dust      4      5   0.003%   0.003%
    	                 eat    199    251   0.131%   0.151%
    	           embroider      0      0   0.000%   0.000%
    	                 hum      1      1   0.001%   0.001%
    	                hunt      0      0   0.000%   0.000%
    	                fish     76     56   0.050%   0.034%
    	                iron      2      7   0.001%   0.004%
    	               knead      0      0   0.000%   0.000%
    	                knit      0      0   0.000%   0.000%
    	                mend      0      0   0.000%   0.000%
    	                milk    155    145   0.102%   0.087%
    	                 mow      0      0   0.000%   0.000%
    	               nurse     11      5   0.007%   0.003%
    	                pack      0     13   0.000%   0.008%
    	               paint      6     10   0.004%   0.006%
    	               paint      6     10   0.004%   0.006%
    	                play    214    250   0.141%   0.150%
    	                plow      0      0   0.000%   0.000%
    	              polish      5      6   0.003%   0.004%
    	                read      5      0   0.003%   0.000%
    	              recite      0      0   0.000%   0.000%
    	                 sew      2      1   0.001%   0.001%
    	              sculpt      0      0   0.000%   0.000%
    	                sing     60    125   0.039%   0.075%
    	              sketch      0      0   0.000%   0.000%
    	                 sow      0      0   0.000%   0.000%
    	               study     17     13   0.011%   0.008%
    	               sweep      6      6   0.004%   0.004%
    	               teach     10      3   0.007%   0.002%
    	                type      0      0   0.000%   0.000%
    	              sketch      0      0   0.000%   0.000%
    	              vacuum      2     11   0.001%   0.007%
    	                wash     58     43   0.038%   0.026%
    	               weave      0      0   0.000%   0.000%
    	             whittle      0      0   0.000%   0.000%
    	               write    404    129   0.266%   0.078%
    

    Example 4 (Checking word distributions of adult and child utterances)

    #---------- usingCHILDESWithNLTK4.py ----------#
    # Description: Processing CHILDES XML data
    # Date: February 6, 2011
    # Written by Tomonori Nagano (tnagano@gc.cuny.edu) for Python2.5
     
    import sys
    sys.path.append('/Library/Python/2.6/site-packages/')
    
    import nltk
    from nltk.probability import ConditionalFreqDist, FreqDist
    from nltk.corpus.reader import CHILDESCorpusReader
    import random
    
    corpus_root = nltk.data.find('corpora/CHILDES/data-xml/Eng-USA/')
    childes = CHILDESCorpusReader(corpus_root,u'.*.xml')
    
    # Stop adding tokens to an age band once it already holds this many.
    sampleNum = 3000
    # cfd[ageBand][word] = token count per 12-month band, per speaker.
    child_cfd = ConditionalFreqDist()
    adult_cfd = ConditionalFreqDist()
    ages = ['1yld','2yld','3yld','4yld','5yld','6yld','7yld']
    
    def make_cfd(filename, ageKey):
    	# Add the child's (CHI) and mother's (MOT) tokens from `filename` to
    	# the band ages[ageKey], unless that band has reached sampleNum.
    	# (`filename` is now an explicit parameter instead of a global.)
    	if child_cfd[ages[ageKey]].N() <= sampleNum:
    		for word in childes.words(filename,speaker=["CHI"]):
    			child_cfd[ages[ageKey]].inc(word)
    	if adult_cfd[ages[ageKey]].N() <= sampleNum:
    		for word in childes.words(filename,speaker=["MOT"]):
    			adult_cfd[ages[ageKey]].inc(word)
    
    # for filename in random.sample(childes.fileids(),100*7):
    for filename in random.sample(childes.fileids(),500):
    	age = childes.age(filename,month=True)[0]
    	# Skip files with no recorded age or an age of 84+ months; otherwise
    	# each 12-month span maps onto an `ages` index (0-11 -> 0, ..., 72-83 -> 6).
    	if age is not None and age < 84:
    		make_cfd(filename, int(age) // 12)
    
    def write_wfl(prefix, this_cfd):
    	# Write one "<prefix><age>.wfl" word-frequency list per age band:
    	# a "type\tf" header, then CR-terminated type/frequency pairs, as
    	# consumed by the zipfR read.tfl() call in the R code below.
    	for age in ages:
    		outfile = open(prefix + age + ".wfl","w")
    		outfile.write("type\tf\r")
    		for word in this_cfd[age].keys():
    			outfile.write(unicode(word).encode("utf-8")+"\t"+str(this_cfd[age][word])+"\r")
    		outfile.close()		# close every file (was only closing the last one)
    
    write_wfl("child", child_cfd)
    write_wfl("adult", adult_cfd)
    
    ################################################
    ## R procedures
    ################################################
    ## clear the cache
    #rm(list = ls())
    #
    ## change the default width
    #width.default <- getOption("width"); options(width=90)
    #
    #library(zipfR)
    #setwd("~/Desktop/")
    #
    #colors = topo.colors(10, alpha = 0.5)
    #speakers = c("adult","child")
    #ages = c('1yld','2yld','3yld','4yld','5yld','6yld','7yld')
    #for (j in 1:length(speakers)){
    #	for (i in 1:length(ages)){
    #		pdf(paste(speakers[j],"_spc_",ages[i],".pdf",sep=""), width = 8, height = 6, onefile = TRUE, pointsize = 9)
    #		filename <- paste(speakers[j],ages[i],".wfl",sep="")
    #		this_spc <- tfl2spc(read.tfl(filename))
    #		this_lnre <- lnre("zm",this_spc)
    #		this_text <- paste("LNRE Model\n--------------------\n","Method: ",this_lnre$name,"\n","alpha=",format(this_lnre$param$alpha,digits=3)," B=",format(this_lnre$param$B,digits=3),"\n","Goodness-of-fit\n","X^2(df=",this_lnre$gof$df,")=",format(this_lnre$gof$X2,digits=1),", p=",format(this_lnre$gof$p,digits=1),sep="")
    #		plot(this_spc,main=paste("Frequency spectrum of ",speakers[j]," utterances in CHILDES\n(",ages[i],"; n=3000; randomly sampled)\nPrinted on: ",format(Sys.time(), "%b %d, %Y"),sep=""),xlab="Frequency Class, m (m=1 is hapax legomenon)",ylab="Frequency of Frequency Class, V_m",barcol=colors[i])
    #		mtext(this_text,side=3,adj=1,padj=2)
    #		dev.off()
    #
    #		pdf(paste(speakers[j],"_vgc_",ages[i],".pdf",sep=""), width = 8, height = 6, onefile = TRUE, pointsize = 9)
    #		this_vgc <- vgc.interp(this_spc,(1:100)*N(this_spc)/100)
    #		plot(this_vgc,main=paste("Vocabulary growth of ",speakers[j]," utterances in CHILDES\n(",ages[i],"; n=3000; randomly sampled)\nPrinted on: ",format(Sys.time(), "%b %d, %Y"),sep=""),xlab="Number of words, N",ylab="Binomially interpolated vocabulary growth curve, E[V(N)]",col=colors[i])
    #		dev.off()
    #	}
    #}
    #
    

    Comments and feedback

    HTML Comment Box is loading comments...