'''
Created on 2020-08-20
@author: wf
'''
import unittest
from corpus.datasources.wikicfp import WikiCfp
from corpus.datasources.wikicfpscrape import WikiCfpScrape,WikiCfpEventFetcher, CrawlType, CrawlBatch
import os
from collections import Counter
import jsonpickle
from datetime import datetime
import corpus.datasources.wikicfpscrape
from tests.datasourcetoolbox import DataSourceTest
[docs]class TestWikiCFP(DataSourceTest):
'''
test events from WikiCFP
'''
[docs] def setUp(self, debug:bool=False, profile:bool=True, **kwargs):
DataSourceTest.setUp(self, debug=debug, profile=profile, **kwargs)
self.wikiCFPDown=False
pass
[docs] def printDelimiterCount(self,names):
'''
print the count of the most common used delimiters in the given name list
'''
ordC=Counter()
for name in names:
if name is not None:
for char in name:
code=ord(char)
if code<ord("A"):
ordC[code]+=1
for index,countT in enumerate(ordC.most_common(10)):
code,count=countT
print ("%d: %d %s -> %d" % (index,code,chr(code),count))
[docs] def testCrawlFilesToJson(self):
'''
test getting the crawlFiles content
'''
wikiCfp=WikiCfp()
wikiCfpScrape=wikiCfp.wikiCfpScrape
expected={
"Event": 87000,
"Series": 6000
}
for crawlType in CrawlType:
jsonEm=wikiCfpScrape.crawlFilesToJson(crawlType=crawlType,withStore=False)
entityList=jsonEm.getList()
self.assertTrue(len(entityList)>expected[crawlType.value])
[docs] def testCrawledJsonFiles(self):
'''
get the crawl files
'''
wikiCfp=WikiCfp()
wikiCfpScrape=wikiCfp.wikiCfpScrape
expected={
"Event": 140,
"Series": 2
}
for crawlType in CrawlType:
crawlFiles=wikiCfpScrape.jsonFiles(crawlType)
expectedLen=expected[crawlType.value]
msg=f"found {len(crawlFiles)} wikiCFP {crawlType.value} crawl files .. expecting {expectedLen}"
print (msg)
self.assertTrue(len(crawlFiles)>=expected[crawlType.value],msg)
[docs] def testJsonPickleDateTime(self):
'''
test the JsonPickle datetime encoding mystery
'''
d=datetime.fromisoformat("2021-07-31")
dp=jsonpickle.encode(d)
if self.debug:
print(dp)
d2=jsonpickle.decode(dp)
self.assertEqual(d,d2)
[docs] def testWikiCFP(self):
'''
test event handling from WikiCFP
'''
wikiCfp=WikiCfp()
wikiCfpScrape=wikiCfp.wikiCfpScrape
jsonEm=wikiCfpScrape.cacheToJsonManager(CrawlType.EVENT)
self.assertTrue(jsonEm.isCached())
self.assertTrue(len(jsonEm.events)>80000)
names=[]
for event in jsonEm.events:
if hasattr(event, "locality"):
names.append(event.locality)
self.printDelimiterCount(names)
pass
[docs] def testInvalidUrl(self):
'''
make sure only valid urls are accepted
'''
eventFetcher=WikiCfpEventFetcher(debug=True)
try:
eventFetcher.fromUrl("http://google.com")
self.fail("invalid url should raise an exception")
except:
pass
[docs] def testEventScraping(self):
'''
test scraping the given event
test "This item has been deleted" WikiCFP items
e.g.
http://www.wikicfp.com/cfp/servlet/event.showcfp?eventid=3
'''
eventIds=[3862,1]
isDeleted=[False,True]
event=WikiCfpEventFetcher(debug=self.debug)
try:
for index,eventId in enumerate(eventIds):
rawEvent=event.fromEventId(eventId)
if self.debug:
print (rawEvent)
self.assertTrue(isDeleted[index]==rawEvent['deleted'])
except Exception as ex:
self.handleError(ex)
[docs] def testGettingEventSeriesForEvent(self):
'''
test extracting the event series id from th event page
'''
# self.debug=True
expectedSeriesId=['1769',None]
eventIds=[1974,139964]
event=WikiCfpEventFetcher(debug=self.debug,timeout=3.5)
try:
for index,eventId in enumerate(eventIds):
rawEvent=event.fromEventId(eventId)
expected=expectedSeriesId[index]
if expected:
self.assertEqual(expected,rawEvent['seriesId'])
else:
self.assertTrue('seriesId' not in rawEvent)
if self.debug:
print (f"{index}:{rawEvent}")
except Exception as ex:
self.handleError(ex)
[docs] def testGettingLatestEvent(self):
'''
get the latest event Id with a binary search
'''
#latestEvent=WikiCFPEventFetcher.getLatestEvent(showProgress=True)
pass
[docs] def testCrawlType(self):
'''
test CrawlType enumeration
'''
for crawlType in CrawlType:
if self.debug:
print(crawlType.urlPrefix)
self.assertTrue(crawlType.urlPrefix.endswith("="))
crawlBatch=CrawlBatch(1,0,1000,crawlTypeValue=crawlType.value)
bCrawlType=crawlBatch.crawlType
self.assertTrue(bCrawlType==crawlType)
self.assertTrue(bCrawlType is crawlType)
[docs] def handleError(self,ex):
'''
handle the given exception
Args:
ex(Exception): the exception to handle
'''
if self.wikiCFPDown and "timed out" in str(ex):
print("WikiCFP is down and we can't do anything about it")
else:
raise ex #self.fail(f"{str(ex)}")
[docs] def getTempJsonDir(self)->str:
jsondir=f"/tmp/wikicfp-crawl"
if not os.path.exists(jsondir):
os.makedirs(jsondir)
return jsondir
[docs] def testCrawlEvents(self):
'''
test crawling a few events and storing the result to a json file
'''
jsondir=self.getTempJsonDir()
try:
wikicfp=WikiCfp()
wikiCfpScrape=wikicfp.wikiCfpScrape
wikiCfpScrape.jsondir=jsondir
limit=10
for crawlTypeValue in [CrawlType.SERIES.value,CrawlType.EVENT.value]:
batch=CrawlBatch(1, 1, limit,crawlTypeValue,None)
batchEm=wikiCfpScrape.crawl(batch)
jsonFilePath=batchEm.getCacheFile()
size=os.stat(jsonFilePath).st_size
if self.debug:
print (f"JSON file for {crawlTypeValue} has size {size}")
self.assertTrue(size>1400)
print (f"scraped {len(batchEm.getList())} {crawlTypeValue} records")
except Exception as ex:
self.handleError(ex)
[docs] def testCrawlEventsViaCommandLine(self):
'''
test crawling via commandline
'''
jsondir=self.getTempJsonDir()
for crawlType in [CrawlType.SERIES]:
args=["--startId", "0", "--stopId", "10","-t", "1", "--targetPath",jsondir,"--crawlType",crawlType.value]
corpus.datasources.wikicfpscrape.main(args)
if __name__ == "__main__":
#import sys;sys.argv = ['', 'Test.testName']
unittest.main()