Commit 3ec08a2e authored by Jared Bebb's avatar Jared Bebb

initial commit

parent c732ada8
# Ignore Downloaded Datasets
*RML2016.10a.tar.bz2
*RML2016.10b.tar.bz2
*RML2016.10a_dict.pkl
*RML2016.10b.dat
*LICENSE.TXT
# Ignore PyTorch Temporary Weights
*.tmp-*.pt
# Ignore Weights Generated from Examples Script
cnn.pt
# Ignore IDE Artifacts
*.vscode
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
# large data files
/data
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="TestRunnerService">
    <option name="PROJECT_TEST_RUNNER" value="Unittests" />
  </component>
</module>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="JavaScriptSettings">
    <option name="languageLevel" value="ES6" />
  </component>
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6 (clustering_deep_dive)" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/Clustering-Semester-Project.iml" filepath="$PROJECT_DIR$/.idea/Clustering-Semester-Project.iml" />
    </modules>
  </component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
</project>
\ No newline at end of file
# corpus at http://groups.di.unipi.it/~gulli/AG_corpus_of_news_articles.html
import pandas as pd

file_path = "data/newsSpace1"
newfile_path = "data/newsSpace2"

def strip_CNET(line):
    return line.replace(r'CNET', 'cnet')

def write_to_new_file():
    # Rewrite the raw newsSpace dump so each record ends up on a single line:
    # a literal \N marks the end of a record, and literal \r sequences inside
    # a record are embedded line breaks.
    count = 0
    with open(file_path, "r", encoding="latin-1") as file, open(newfile_path, "w", encoding="latin-1") as newfile:
        for line in file:
            end_of_record = False
            # if count >= 2000000:
            #     break
            end = line.find(r'\N')
            if end > -1:
                end_of_record = True
            newline = line.replace(r'\r', '\n')
            newline = newline.rstrip("\n\r")
            # newline = strip_CNET(newline)
            if end_of_record:
                newline = newline + "\n"
                count += 1
            # if "CNET" not in newline:
            #     if "Guardian" not in newline:
            newfile.write(str(newline))
    print("count:", count)

col_names = ["Field", "Type", "Nothing", "Key", "Default", "Extra", "Label", "Time"]
write_to_new_file()
# df = pd.read_table(filepath_or_buffer=newfile_path, sep="\t", index_col=None, header=0, names=["Field1", "Type1", "Nothing1", "Key1", "Default1", "Extra1"])
# error_bad_lines was removed in pandas 2.0; newer versions use on_bad_lines="skip" instead.
df = pd.read_csv(filepath_or_buffer=newfile_path, encoding="latin-1", sep="\t", index_col=False,
                 header=0, names=col_names, error_bad_lines=False)
# df = pd.read_csv(newfile_path, delimiter='\t', encoding='utf-8', header=None, quoting=csv.QUOTE_NONE)
# df = pd.read_fwf(filepath_or_buffer=newfile_path)
# df = pd.read_csv(filepath_or_buffer=newfile_path)
print(len(df.columns))
print(df)
for name in col_names:
    print(df[name])
df.to_pickle("data/newsSpace.pkl")
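
# Optional sanity check of the pickle written above (a minimal sketch; with the col_names
# used in this script, the category string happens to land in the 'Default' column):
check = pd.read_pickle("data/newsSpace.pkl")
print(check.shape)
print(check["Default"].value_counts().head(10))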
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))
from bs4 import BeautifulSoup
import plotly.graph_objs as go
import chart_studio.plotly as py
# import cufflinks
from IPython.core.interactiveshell import InteractiveShell
import plotly.figure_factory as ff
InteractiveShell.ast_node_interactivity = 'all'
from plotly.offline import iplot
# df = pd.read_csv('consumer_complaints_small.csv')
df = pd.read_pickle('data/newsSpace.pkl')
print(df.head())
df.info()
col_names = ["Field", "Type","Nothing","Key","Default","Extra","Label", "Time",]
for name in col_names:
    print(df[name])
# 11.
df = df.reset_index(drop=True)
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
    text: a string
    return: modified initial string
    """
    text = text.lower()  # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text)  # replace REPLACE_BY_SPACE_RE symbols with a space
    text = BAD_SYMBOLS_RE.sub('', text)  # delete symbols matched by BAD_SYMBOLS_RE
    text = text.replace('x', '')
    # text = re.sub(r'\W+', '', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)  # remove stopwords from text
    return text
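
# Illustration (hypothetical headline, not from the dataset) of what clean_text does:
#   clean_text("NASA Launches New Mission to the Moon!") -> "nasa launches new mission moon"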
# drop nulls in text column
df = df.dropna(subset=['Extra'])
df['Extra'] = df['Extra'].apply(clean_text)
# 12.
df['Extra'] = df['Extra'].str.replace(r'\d+', '', regex=True)  # strip digits
# 15.
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['Extra'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
# 16.
X = tokenizer.texts_to_sequences(df['Extra'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)
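# What the two calls above produce (sketch; the integer ids depend on the fitted word_index):
#   tokenizer.texts_to_sequences(["nasa mission moon"]) -> [[id_nasa, id_mission, id_moon]]
#   pad_sequences(..., maxlen=MAX_SEQUENCE_LENGTH) then left-pads each sequence with zeros to length 250.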
# 17.
Y = pd.get_dummies(df['Default']).values
print('Shape of label tensor:', Y.shape)
print('Unique Label tensors:', np.unique(df['Default']))
# 18
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)
# 20
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(Y.shape[1], activation='softmax'))  # one output unit per label column (17 for this data)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
# 21
epochs = 1
batch_size = 1000
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_split=0.1,
                    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
# 22.
accr = model.evaluate(X_test,Y_test)
print('Test set\n Loss: {:0.3f}\n Accuracy: {:0.3f}'.format(accr[0],accr[1]))
# 23.
plt.title('Loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.legend()
plt.show()
# 24.
# plt.title('Accuracy')
# plt.plot(history.history['acc'], label='train')
# plt.plot(history.history['val_acc'], label='test')
# plt.legend()
# plt.show()
# 39.
# new_complaint = ['I am a victim of identity theft and someone stole my identity and personal information to open up a Visa credit card account with Bank of America. The following Bank of America Visa credit card account do not belong to me : XXXX.']
new_complaint = ['manned mission that will orbit the globe 200 times before returning to Earth']
seq = tokenizer.texts_to_sequences(new_complaint)
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(padded)
# labels = ['Credit reporting, credit repair services, or other personal consumer reports', 'Debt collection', 'Mortgage', 'Credit card or prepaid card', 'Student loan', 'Bank account or service', 'Checking or savings account', 'Consumer Loan', 'Payday loan, title loan, or personal loan', 'Vehicle loan or lease', 'Money transfer, virtual currency, or money service', 'Money transfers', 'Prepaid card']
labels = [
'2007-02-09 16:33:30','2007-02-26 21:29:45','<p><a href="http://us.rd.yahoo.com/dailynews/rss/europe/*http://news.yahoo.com/s/nm/20070302/od_uk_nm/oukoe_uk_peru_time"><img src="http://d.yimg.com/us.yimg.com/p/rids/20070302/i/ra3960341363.jpg?x=130&y=89&sig=ZkU5nEV4vik6uFijBFBS1Q--" align="left" height="89" width="130" alt="Peruvian waiters run holding typical dishes during a waiters race as a part of the celebrations for Lima\'s city anniversary, in Lima, January 18, 2007. An official \'punctuality\' drive has been aunched in Peru with fanfare far and wide. REUTERS/Pilar Olivares" border="0" /></a>Reuters - Guidebooks warn tourists of Peruvian time\\-- be fashionably late for parties and dinner or risk a faux\\pas.</p><br clear="all"/>',
'Business','Entertainment','Europe','Health','Italia','Music Feeds','Sci/Tech',
'Software and Developement','Sports','Toons','Top News','Top Stories','U.S.','World'
]
print(pred, labels[np.argmax(pred)])
\ No newline at end of file
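
# A safer variant of the lookup above (sketch, reusing the same df and pred as this script):
# derive the label order straight from the pd.get_dummies columns instead of hard-coding it,
# so the index always lines up with the columns of Y.
label_columns = pd.get_dummies(df['Default']).columns.tolist()
print(pred, label_columns[np.argmax(pred)])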