Source code for tests.testWikiCfpScrape

'''
Created on 2020-08-20

@author: wf
'''
import unittest
from corpus.datasources.wikicfp import WikiCfp
from corpus.datasources.wikicfpscrape import WikiCfpScrape,WikiCfpEventFetcher, CrawlType, CrawlBatch
import os
from collections import Counter
import jsonpickle
from datetime import datetime
import corpus.datasources.wikicfpscrape
from tests.datasourcetoolbox import DataSourceTest


class TestWikiCFP(DataSourceTest):
    '''
    test events from WikiCFP
    '''

    def setUp(self, debug:bool=False, profile:bool=True, **kwargs):
        '''
        prepare the test case

        Args:
            debug(bool): debug flag handed on to DataSourceTest.setUp
            profile(bool): profile flag handed on to DataSourceTest.setUp
            **kwargs: further keyword arguments for DataSourceTest.setUp
        '''
        super().setUp(debug=debug, profile=profile, **kwargs)
        # when True, handleError tolerates "timed out" errors instead of re-raising
        self.wikiCFPDown=False

    def printDelimiterCount(self,names):
        '''
        print the count of the most common used delimiters in the given name list

        Only characters with a code point below the one of "A" are counted;
        the ten most frequent ones are printed.

        Args:
            names(list): the names to analyze - None entries are skipped
        '''
        limit=ord("A")
        ordC=Counter()
        for name in names:
            if name is not None:
                ordC.update(code for code in map(ord,name) if code<limit)
        for index,(code,count) in enumerate(ordC.most_common(10)):
            print(f"{index}: {code} {chr(code)} -> {count}")

    def testCrawlFilesToJson(self):
        '''
        test getting the crawlFiles content
        '''
        wikiCfp=WikiCfp()
        wikiCfpScrape=wikiCfp.wikiCfpScrape
        # lower bounds for the number of entities per crawl type
        expected={
            "Event": 87000,
            "Series": 6000
        }
        for crawlType in CrawlType:
            jsonEm=wikiCfpScrape.crawlFilesToJson(crawlType=crawlType,withStore=False)
            entityList=jsonEm.getList()
            self.assertGreater(len(entityList),expected[crawlType.value])

    def testCrawledJsonFiles(self):
        '''
        get the crawl files
        '''
        wikiCfp=WikiCfp()
        wikiCfpScrape=wikiCfp.wikiCfpScrape
        # minimum number of crawl files per crawl type
        expected={
            "Event": 140,
            "Series": 2
        }
        for crawlType in CrawlType:
            crawlFiles=wikiCfpScrape.jsonFiles(crawlType)
            expectedLen=expected[crawlType.value]
            msg=f"found {len(crawlFiles)} wikiCFP {crawlType.value} crawl files .. expecting {expectedLen}"
            print (msg)
            self.assertGreaterEqual(len(crawlFiles),expectedLen,msg)

    def testJsonPickleDateTime(self):
        '''
        test the JsonPickle datetime encoding mystery

        a datetime has to survive an encode/decode round trip unchanged
        '''
        d=datetime.fromisoformat("2021-07-31")
        dp=jsonpickle.encode(d)
        if self.debug:
            print(dp)
        d2=jsonpickle.decode(dp)
        self.assertEqual(d,d2)

    def testWikiCFP(self):
        '''
        test event handling from WikiCFP
        '''
        wikiCfp=WikiCfp()
        wikiCfpScrape=wikiCfp.wikiCfpScrape
        jsonEm=wikiCfpScrape.cacheToJsonManager(CrawlType.EVENT)
        self.assertTrue(jsonEm.isCached())
        self.assertGreater(len(jsonEm.events),80000)
        # only events that carry a locality attribute contribute a name
        names=[event.locality for event in jsonEm.events if hasattr(event, "locality")]
        self.printDelimiterCount(names)

    def testInvalidUrl(self):
        '''
        make sure only valid urls are accepted
        '''
        eventFetcher=WikiCfpEventFetcher(debug=True)
        # assertRaises rather than try/self.fail/bare except: a bare except
        # also swallows the AssertionError of self.fail, so the test could never fail
        with self.assertRaises(Exception):
            eventFetcher.fromUrl("http://google.com")

    def testEventScraping(self):
        '''
        test scraping the given event

        test "This item has been deleted" WikiCFP items
        e.g. http://www.wikicfp.com/cfp/servlet/event.showcfp?eventid=3
        '''
        eventIds=[3862,1]
        isDeleted=[False,True]
        event=WikiCfpEventFetcher(debug=self.debug)
        try:
            for eventId,expectedDeleted in zip(eventIds,isDeleted):
                rawEvent=event.fromEventId(eventId)
                if self.debug:
                    print (rawEvent)
                self.assertEqual(expectedDeleted,rawEvent['deleted'])
        except Exception as ex:
            self.handleError(ex)

    def testGettingEventSeriesForEvent(self):
        '''
        test extracting the event series id from the event page
        '''
        # self.debug=True
        expectedSeriesId=['1769',None]
        eventIds=[1974,139964]
        event=WikiCfpEventFetcher(debug=self.debug,timeout=3.5)
        try:
            for index,(eventId,expected) in enumerate(zip(eventIds,expectedSeriesId)):
                rawEvent=event.fromEventId(eventId)
                if expected:
                    self.assertEqual(expected,rawEvent['seriesId'])
                else:
                    # no series expected: the key must be absent
                    self.assertNotIn('seriesId',rawEvent)
                if self.debug:
                    print (f"{index}:{rawEvent}")
        except Exception as ex:
            self.handleError(ex)

    def testGettingLatestEvent(self):
        '''
        get the latest event Id with a binary search
        '''
        # currently disabled - the test body is intentionally empty
        #latestEvent=WikiCFPEventFetcher.getLatestEvent(showProgress=True)
        pass

    def testCrawlType(self):
        '''
        test CrawlType enumeration
        '''
        for crawlType in CrawlType:
            if self.debug:
                print(crawlType.urlPrefix)
            self.assertTrue(crawlType.urlPrefix.endswith("="))
            crawlBatch=CrawlBatch(1,0,1000,crawlTypeValue=crawlType.value)
            bCrawlType=crawlBatch.crawlType
            # the batch has to resolve its value to the very same enum member
            self.assertEqual(bCrawlType,crawlType)
            self.assertIs(bCrawlType,crawlType)

    def handleError(self,ex):
        '''
        handle the given exception

        A "timed out" error is only reported via print if self.wikiCFPDown
        is set; any other exception is raised again.

        Args:
            ex(Exception): the exception to handle

        Raises:
            Exception: the given exception unless it is a tolerated timeout
        '''
        if self.wikiCFPDown and "timed out" in str(ex):
            print("WikiCFP is down and we can't do anything about it")
        else:
            raise ex
            #self.fail(f"{str(ex)}")

    def getTempJsonDir(self)->str:
        '''
        get the directory for temporary crawl results and create it if necessary

        Returns:
            str: the path of the directory
        '''
        jsondir="/tmp/wikicfp-crawl"
        # exist_ok avoids the race between an existence check and the creation
        os.makedirs(jsondir,exist_ok=True)
        return jsondir

    def testCrawlEvents(self):
        '''
        test crawling a few events and storing the result to a json file
        '''
        jsondir=self.getTempJsonDir()
        try:
            wikicfp=WikiCfp()
            wikiCfpScrape=wikicfp.wikiCfpScrape
            wikiCfpScrape.jsondir=jsondir
            limit=10
            for crawlTypeValue in [CrawlType.SERIES.value,CrawlType.EVENT.value]:
                batch=CrawlBatch(1, 1, limit,crawlTypeValue,None)
                batchEm=wikiCfpScrape.crawl(batch)
                jsonFilePath=batchEm.getCacheFile()
                size=os.stat(jsonFilePath).st_size
                if self.debug:
                    print (f"JSON file for {crawlTypeValue} has size {size}")
                self.assertGreater(size,1400)
                print (f"scraped {len(batchEm.getList())} {crawlTypeValue} records")
        except Exception as ex:
            self.handleError(ex)

    def testCrawlEventsViaCommandLine(self):
        '''
        test crawling via commandline
        '''
        jsondir=self.getTempJsonDir()
        for crawlType in [CrawlType.SERIES]:
            args=["--startId", "0", "--stopId", "10","-t", "1", "--targetPath",jsondir,"--crawlType",crawlType.value]
            corpus.datasources.wikicfpscrape.main(args)
if __name__ == "__main__":
    # to run a single test: import sys;sys.argv = ['', 'Test.testName']
    unittest.main()