Analyzing the (Open) American National Corpus (OANC)

Setting up

Testing your setup

username@home:~/$ cd Desktop
username@home:~/Desktop$ ls
oanc.py
username@home:~/Desktop$ python
Python 2.6.1 (r261:67515, Jun 24 2010, 21:47:49) 
[GCC 4.2.1 (Apple Inc. build 5646)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> import nltk
>>> import oanc
>>> oanc.demo()
Counting the number of files and words. This may take a few minutes...
total number of files: 8815
spoken/face-to-face/charlotte 	face-to-face 	92 	244065
spoken/telephone/switchboard 	telephone 	2299 	4455405
written_1/fiction/eggan 	fiction 	0 	0
written_1/journal/slate 	journal 	4530 	4965835
written_1/journal/verbatim 	journal 	31 	665812
written_1/letters/icic 	letters 	244 	106737
written_2/non-fiction/OUP 	non-fiction 	44 	293752
written_2/technical/911report 	technical 	16 	335506
written_2/technical/biomed 	technical 	836 	3953949
something is wrong with  written_2/technical/government/Media/Aid_Gets_7_Million.xml
something is wrong with  written_2/technical/government/Media/Poor_Lacking_Legal_Aid.xml
written_2/technical/government 	technical 	281 	1045980
written_2/technical/plos 	technical 	251 	474667
written_2/travel_guides/berlitz1 	travel_guides 	100 	320385

Your OANC Corpus root is: /Users/tomonori/nltk_data/corpora/OANC_xml/data

Filenames (first 10): ['spoken/face-to-face/charlotte/AdamsElissa.xml', 'spoken/face-to-face/charlotte/AdamsStephanie.xml', 
	'spoken/face-to-face/charlotte/AdinolfiDavidandGail.xml', 'spoken/face-to-face/charlotte/ArguetaBertila-ENG.xml', 
	'spoken/face-to-face/charlotte/AverittShannon.xml', 'spoken/face-to-face/charlotte/BlanchardTracy.xml', 
	'spoken/face-to-face/charlotte/BorelRaymondHydeII.xml', 'spoken/face-to-face/charlotte/BorelRaymondHydeIII.xml', 
	'spoken/face-to-face/charlotte/BranchElbert.xml', 'spoken/face-to-face/charlotte/CaseCullen.xml']

Words (first 10): ['All', 'right', ',', 'this', 'is', 'Elissa', 'Adams', 'and', 'Elissa', ',', 'you', "'ve", 'lived', 'in', 
	'Charlotte', 'for--', 'Five', 'years', '.', 'Five']

Words with POS tags (first 10): [('All', 'DT'), ('right', 'NN'), (',', ','), ('this', 'DT'), ('is', 'VBZ'), ('Elissa', 'NNP'), 
	('Adams', 'NNP'), ('and', 'CC'), ('Elissa', 'NNP'), (',', ','), ('you', 'PRP'), ("'ve", 'VBP'), ('lived', 'VBN'), ('in', 'IN'), 
	('Charlotte', 'NNP'), ('for--', 'JJ'), ('Five', 'CD'), ('years', 'NNS'), ('.', '.'), ('Five', 'CD')]

Sentences (first 5): [['All', 'right', ',', 'this', 'is', 'Elissa', 'Adams', 'and', 'Elissa', ',', 'you', "'ve", 'lived', 'in', 'Charlotte', 'for--'], 
	['Five', 'years', '.'], ['Five', 'years', '.'], ['OK', '.'], ['Um', '.']]

Sentences with POS tags (first 5): [[('All', 'DT'), ('right', 'NN'), (',', ','), ('this', 'DT'), ('is', 'VBZ'), ('Elissa', 'NNP'), 
	('Adams', 'NNP'), ('and', 'CC'), ('Elissa', 'NNP'), (',', ','), ('you', 'PRP'), ("'ve", 'VBP'), ('lived', 'VBN'), ('in', 'IN'), 
	('Charlotte', 'NNP'), ('for--', 'JJ')], [('Five', 'CD'), ('years', 'NNS'), ('.', '.')], [('Five', 'CD'), ('years', 'NNS'), ('.', '.')], 
	[('OK', 'JJ'), ('.', '.')], [('Um', 'NNP'), ('.', '.')]]

OANC Corpus Structure

username@home:~/nltk_data/corpora/OANC_xml$ tree -h
.
|-- [  68]  Uninstaller
`-- [ 204]  data
    |-- [ 170]  spoken
    |   |-- [ 136]  face-to-face
    |   |   `-- [3.1K]  charlotte
    |   |       |-- [ 84K]  AdamsElissa.xml
    |   |       |-- [ 70K]  AdamsStephanie.xml
    |   |       |-- [171K]  AdinolfiDavidandGail.xml
    |   |       |-- [280K]  ArguetaBertila-ENG.xml
    |   |       |-- [169K]  AverittShannon.xml
    |   |       |-- [ 82K]  BlanchardTracy.xml
    |   |       |-- [386K]  BorelRaymondHydeII.xml
    |   |       |-- [109K]  BorelRaymondHydeIII.xml
    |   |       |-- [ 48K]  BranchElbert.xml
    |   |       |-- [ 78K]  CaseCullen.xml
    |   |       |-- [149K]  CerdaAdam.xml
    |   |       |-- [747K]  ChapmanDebbie.xml
    |   |       |-- [143K]  CheezemBurt.xml
    |   |       |-- [ 89K]  CombsJane.xml
    |   |       |-- [ 20K]  CorderoRosalinda.xml
    |   |       |-- [122K]  CottonGloria.xml
    |   |       |-- [130K]  CoxJeremy.xml
    |   |       |-- [123K]  DeLuciaBrian.xml
    |   |       |-- [110K]  EmrichDavid.xml
    |   |       |-- [ 66K]  EmrichMegan.xml
    |   |       |-- [142K]  FarrowThomas.xml
    |   |       |-- [161K]  FeberAngelina.xml
    |   |       |-- [184K]  FlonoFannie.xml
    |   |       |-- [ 44K]  FosterDimitra.xml
    |   |       |-- [271K]  FrancisClem.xml
    |   |       |-- [142K]  FunkBernie.xml
    |   |       |-- [153K]  GavinJeff.xml
    |   |       |-- [141K]  GillardSylvia.xml
    |   |       |-- [ 63K]  GoedekeThomas.xml
    |   |       |-- [125K]  GoodroeBarbara.xml
    |   |       |-- [ 88K]  GreeneKevin.xml
see all