import os
import sys
import unittest
import json
from StringIO import StringIO
import src.main.python.parse as parse
class KaggleRawDataParserTest(unittest.TestCase):
    """Tests for ``parse.KaggleRawDataParser``.

    The parser is always constructed without arguments, right after the test
    has adjusted ``sys.argv``; each test therefore drives it through
    ``sys.argv`` and inspects what it printed by swapping ``sys.stdout`` for
    an in-memory buffer.
    """

    # Sample input, located under "resources" in the parent of this file's
    # directory.
    raw_test_file = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + "/resources/trainPostsSampleWith3Elements.json"

    # Class-level placeholders only; setUp() rebinds all three per test.
    saved_stdout = sys.stdout
    saved_argv = None
    out = StringIO()

    # (post_id, date_gmt, title, url) of the documents the tests expect, in
    # the order they are compared against the output lines.
    _SAMPLE_POSTS = (
        ("507823", "2012-03-28 03:36:57", "#vipworkshop dinner",
         "http://matt.wordpress.com/?p=3837"),
        ("1406963", "2012-03-28 04:41:37", "Oven roasted tomatoes",
         "http://matt.wordpress.com/?p=3839"),
        ("1329369", "2012-03-28 19:59:45", "Fish tacos and spicy slaw",
         "http://matt.wordpress.com/?p=3841"),
    )

    def setUp(self):
        """Point ``sys.argv`` at the sample file and start capturing stdout."""
        # Save the live values (not the import-time ones) so tearDown puts
        # back exactly what the test runner had installed.
        self.saved_argv = sys.argv
        self.saved_stdout = sys.stdout
        # Replace instead of append: arguments given to the test runner must
        # not leak into the command line seen by the parser.
        sys.argv = [sys.argv[0], self.raw_test_file]
        self.out = StringIO()
        sys.stdout = self.out

    def tearDown(self):
        """Restore the ``sys.stdout`` and ``sys.argv`` saved by ``setUp``."""
        sys.stdout = self.saved_stdout
        sys.argv = self.saved_argv

    def _expected_documents(self, with_popularity=False):
        """Return the expected documents, one dict per sample post.

        :param with_popularity: when true, every document's ``fields`` also
            carries ``"popularity": 1.0``.
        :return: list of dicts shaped like the parsed JSON output lines.
        """
        documents = []
        for post_id, date_gmt, title, url in self._SAMPLE_POSTS:
            fields = {
                "author": "5",
                "blog": "4",
                "blogname": "Matt on Not-WordPress",
                "categories": [
                    "Moblog"
                ],
                "content": "",
                "date": 20120328,
                "date_gmt": date_gmt,
                "language": "en",
                "post_id": post_id,
                "tags": [],
                "title": title,
                "url": url
            }
            if with_popularity:
                fields["popularity"] = 1.0
            documents.append({
                "fields": fields,
                "put": "id:blog-search:blog_post::" + post_id
            })
        return documents

    def _assert_output_matches(self, expected_documents):
        """Compare the captured stdout, line by line, with the expected dicts.

        Each captured line is decoded as JSON and compared with the document
        at the same position. Lines beyond the expected ones are not checked.
        """
        output_lines = self.out.getvalue().strip().split('\n')
        # Fail with a readable message instead of an IndexError / JSON error
        # when the parser printed too little.
        self.assertTrue(
            len(output_lines) >= len(expected_documents),
            "expected at least %d output lines, got %d"
            % (len(expected_documents), len(output_lines)))
        for line, expected in zip(output_lines, expected_documents):
            self.assertEqual(json.loads(line), expected)

    def test_no_flags(self):
        # With only the file argument, popularity is off and the file path
        # is stored unchanged.
        parser = parse.KaggleRawDataParser()
        self.assertFalse(parser.popularity)
        self.assertEqual(parser.raw_data_file, self.raw_test_file)

    def test_popularity_flag(self):
        # "-p" switches the popularity attribute on.
        sys.argv.append("-p")
        parser = parse.KaggleRawDataParser()
        self.assertTrue(parser.popularity)

    def test_parsing_without_popularity(self):
        parser = parse.KaggleRawDataParser()
        # NOTE(review): this test calls parse() directly while the popularity
        # test goes through main() -- confirm the difference is intentional.
        parser.parse()
        self._assert_output_matches(self._expected_documents())

    def test_parsing_with_popularity(self):
        sys.argv.append("-p")
        parser = parse.KaggleRawDataParser()
        parser.main()
        self._assert_output_matches(
            self._expected_documents(with_popularity=True))
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()