# Source code for tests.testWebScrape

'''
Created on 2021-07-31

@author: wf
'''
import unittest
from corpus.datasources.webscrape import WebScrape
from corpus.datasources.wikicfpscrape import CrawlType
from tests.datasourcetoolbox import DataSourceTest


class TestWebScrape(DataSourceTest):
    '''
    test getting rdfA based triples from Webpages
    '''

    def testCrawlType(self):
        '''
        test CrawlType isValid

        "Event" is expected to be accepted and "Homepage" to be rejected.
        '''
        self.assertTrue(CrawlType.isValid("Event"))
        self.assertFalse(CrawlType.isValid("Homepage"))

    def testWebScrape(self):
        '''
        test getting rdfA encoded info from a webpage

        Scrapes the CEUR-WS volume page Vol-2635 with a scrape description
        that maps the keys acronym, title and loctime to span elements
        selected by their class attribute, then compares the scraped values
        with the expected ones.

        If the scraper reports an error the test is skipped instead of
        returning silently, so that the test report does not show a pass
        for a run in which no assertion was executed.
        '''
        debug = self.debug
        url = "http://ceur-ws.org/Vol-2635/"
        # use a larger timeout value when running in the CI environment
        scrape = WebScrape(timeout=20 if self.inCI() else 3)
        scrapeDescr = [
            {'key': 'acronym', 'tag': 'span', 'attribute': 'class', 'value': 'CEURVOLACRONYM'},
            {'key': 'title', 'tag': 'span', 'attribute': 'class', 'value': 'CEURFULLTITLE'},
            {'key': 'loctime', 'tag': 'span', 'attribute': 'class', 'value': 'CEURLOCTIME'},
        ]
        scrapedDict = scrape.parseWithScrapeDescription(url, scrapeDescr)
        if scrape.err:
            # presumably a connectivity/availability problem of the remote
            # site - report it as a skip rather than as a success or failure
            self.skipTest(
                "%s - We might not be able to do anything about it" % (scrape.err,)
            )
        if debug:
            print(scrapedDict)
        self.assertEqual('DL4KG2020', scrapedDict["acronym"])
        self.assertEqual('Heraklion, Greece, June 02, 2020', scrapedDict["loctime"])
        self.assertEqual(
            'Proceedings of the Workshop on Deep Learning for Knowledge Graphs (DL4KG2020)',
            scrapedDict["title"],
        )
if __name__ == "__main__":
    # To run a single test only, set sys.argv before the call below,
    # e.g.: import sys; sys.argv = ['', 'Test.testName']
    unittest.main()