'''
Created on 2021-01-25
@author: wf
'''
import time
from corpus.datasources.dblpxml import DblpXml
from lodstorage.schema import SchemaManager
from datetime import datetime
from lodstorage.uml import UML
from tests.datasourcetoolbox import DataSourceTest
class TestDblp(DataSourceTest):
    '''
    test the dblp xml parser and pylodstorage extraction for it
    '''
    # class wide switch: True -> work on the small sample download,
    # False -> work on the full dblp dump (multi-Gigabyte download)
    mock=True

    def setUp(self, debug:bool=False, profile:bool=True, **kwargs):
        '''
        setUp the test environment
        especially the mocking parameter - if mock is False a multi-Gigabyte download
        might be activated

        Args:
            debug(bool): if True log() prints its messages
            profile(bool): accepted but not used by this setUp
            kwargs: accepted but not used by this setUp
        '''
        # NOTE(review): the base class setUp is not called here - confirm
        # that DataSourceTest does not need its own initialization
        self.debug=debug
        self.verbose=True
        self.mock=TestDblp.mock

    def tearDown(self):
        '''
        nothing to clean up
        '''
        pass

    def log(self,msg):
        '''
        print the given message if debugging is active

        Args:
            msg(str): the message to print
        '''
        if self.debug:
            print(msg)

    @staticmethod
    def getMockedDblp(mock=True,debug=False):
        '''
        get a DblpXml instance

        Args:
            mock(bool): if True redirect the instance to the sample download
                and the /tmp/dblp directory before the xml file is fetched
            debug(bool): passed on to DblpXml; if True the file info is printed

        Returns:
            DblpXml: the instance after getXmlFile() has been called
        '''
        dblpXml=DblpXml(debug=debug)
        if mock:
            dblpXml.xmlpath="/tmp/dblp"
            dblpXml.gzurl="https://github.com/WolfgangFahl/ConferenceCorpus/wiki/data/dblpsample.xml.gz"
            # make the changed path and url effective
            dblpXml.reinit()
        xmlfile=dblpXml.getXmlFile()
        sizeMB=dblpXml.getSize()/1024/1024
        if debug or not mock:
            print(f"dblp xml file is {xmlfile} with size {sizeMB:5.1f} MB" )
        return dblpXml

    def getSqlDB(self,mock=True,recreate=False):
        '''
        get the Sql Database

        Args:
            mock(bool): if True use the sample data and a lower record limit
            recreate(bool): passed on to DblpXml.getSqlDB

        Returns:
            the database as returned by DblpXml.getSqlDB
        '''
        dblpXml=self.getMockedDblp(mock=mock)
        limit=10000 if mock else 10000000
        # progress display only for the full data set and outside of CI
        showProgress=not mock and not self.inCI()
        sample=5
        sqlDB=dblpXml.getSqlDB(limit, sample=sample, debug=self.debug,recreate=recreate,postProcess=dblpXml.postProcess,showProgress=showProgress)
        return sqlDB

    def testDblpDownload(self):
        '''
        test dblp access
        '''
        isMocked=TestDblp.mock
        dblp=self.getMockedDblp(mock=isMocked)
        # minimum expected file size in bytes for sample / full dump
        minsize=988816 if isMocked else 3099271450
        self.assertTrue(dblp.isDownloaded(minsize=minsize))

    def testCreateSample(self):
        '''
        test creating a sample file
        '''
        dblpXml=self.getMockedDblp()
        sampletree=dblpXml.createSample()
        # len(element) counts the direct children; getchildren() is
        # deprecated and no longer available in xml.etree.ElementTree
        records=len(sampletree.getroot())
        self.log(f"sample has {records} records")
        samplefile="/tmp/dblpsample.xml"
        with open(samplefile,'wb') as f:
            sampletree.write(f,encoding='UTF-8')

    def testDblpXmlParser(self):
        '''
        test parsing the xml file
        '''
        # this test always works on the sample data
        isMocked=True
        dblpXml=self.getMockedDblp()
        xmlfile=dblpXml.getXmlFile()
        self.assertIsNotNone(xmlfile)
        showProgressAt=500000 if self.debug else 5000000
        index=0
        starttime=time.time()
        for index,(_event,elem) in enumerate(dblpXml.iterParser(),start=1):
            if index%showProgressAt==0:
                elapsed=time.time()-starttime
                print(f"{index:8d}: {elapsed:5.1f} s {index/elapsed:5.0f}/s {elem}")
            # release the element once it has been visited
            dblpXml.clear_element(elem)
        expectedIndex=35000 if isMocked else 70000000
        self.assertGreater(index,expectedIndex)

    def checkConfColumn(self,sqlDB):
        '''
        check the conference columns

        Args:
            sqlDB: the database to check - needs a getTableDict() method
        '''
        tableDict=sqlDB.getTableDict()
        # real membership check - the former assertTrue on a string literal
        # could never fail
        self.assertIn("proceedings",tableDict)
        proceedingsTable=tableDict["proceedings"]
        pcols=proceedingsTable["columns"]
        self.assertIn("conf",pcols)

    def testSqlLiteDatabaseCreation(self):
        '''
        get dict of list of dicts (tables)
        '''
        mock=self.mock
        # only run against the sample data
        if not mock:
            return
        sqlDB=self.getSqlDB(mock=mock,recreate=True)
        try:
            tableList=sqlDB.getTableList()
            expected=6 if self.mock else 8
            self.assertEqual(expected,len(tableList))
            self.checkConfColumn(sqlDB)
        finally:
            # close the database even if an assertion fails
            sqlDB.close()

    def testIssue5(self):
        '''
        https://github.com/WolfgangFahl/ConferenceCorpus/issues/5
        dblp xml parser skips some proceedings titles
        '''
        xml="""<?xml version="1.0" encoding="ISO-8859-1"?>
<!DOCTYPE dblp SYSTEM "dblp.dtd">
<dblp><proceedings mdate="2019-05-14" key="conf/pfe/2001">
<editor>Frank van der Linden 0001</editor>
<title>Software Product-Family Engineering, 4th International Workshop, PFE 2001, Bilbao, Spain, October 3-5, 2001, Revised Papers</title>
<booktitle>PFE</booktitle>
<series href="db/series/lncs/index.html">Lecture Notes in Computer Science</series>
<volume>2290</volume>
<publisher>Springer</publisher>
<year>2002</year>
<isbn>3-540-43659-6</isbn>
<ee>https://doi.org/10.1007/3-540-47833-7</ee>
<url>db/conf/pfe/pfe2001.html</url>
</proceedings>
<proceedings mdate="2019-01-26" key="conf/hpcasia/2019">
<title>Proceedings of the International Conference on High Performance Computing in Asia-Pacific Region, HPC Asia 2019, Guangzhou, China, January 14-16, 2019</title>
<publisher>ACM</publisher>
<booktitle>HPC Asia</booktitle>
<year>2019</year>
<isbn>978-1-4503-6632-8</isbn>
<ee>https://dl.acm.org/citation.cfm?id=3293320</ee>
<url>db/conf/hpcasia/hpcasia2019.html</url>
</proceedings>
<proceedings mdate="2020-03-27" key="journals/corr/OrchardY16a">
<editor orcid="0000-0002-7058-7842">Dominic A. Orchard</editor>
<editor orcid="0000-0002-3925-8557">Nobuko Yoshida</editor>
<title>Proceedings of the Ninth workshop on Programming Language Approaches to Concurrency- and Communication-cEntric Software, PLACES 2016, Eindhoven, The Netherlands, 8th April 2016.</title>
<booktitle>PLACES</booktitle>
<year>2016</year>
<series href="db/series/eptcs/index.html">EPTCS</series>
<volume>211</volume>
<url>db/series/eptcs/eptcs211.html</url>
<ee type="oa">https://doi.org/10.4204/EPTCS.211</ee>
<ee type="oa">http://arxiv.org/abs/1606.05403</ee>
</proceedings>
</dblp>"""
        xmlname="dblptitleempty.xml"
        xmlpath="/tmp"
        # write with the encoding announced in the xml declaration
        with open(f"{xmlpath}/{xmlname}", 'w', encoding="ISO-8859-1") as xmlfile:
            xmlfile.write(xml)
        dblpXml=DblpXml(xmlname=xmlname,xmlpath=xmlpath)
        dictOfLod=dblpXml.asDictOfLod()
        self.assertIn("proceedings",dictOfLod)
        procs=dictOfLod["proceedings"]
        self.assertEqual(3,len(procs))
        self.assertTrue(procs[0]["title"].startswith("Software Product-Family Engineering"))

    def testQueries(self):
        '''
        test the parameterized query
        '''
        sqlDB=self.getSqlDB(mock=TestDblp.mock,recreate=self.mock)
        try:
            self.checkConfColumn(sqlDB)
            query="select * from proceedings where conf=?"
            records=sqlDB.query(query,('iccv',))
            self.log("found %d iccv records" % len(records))
            self.assertGreaterEqual(len(records),19)
            # proceedings without a title point to issue 5
            query="select key,conf,booktitle,title from proceedings where title is null"
            records=sqlDB.query(query)
            if len(records)>0:
                for record in records:
                    self.log(record)
                # only warn - the issue is known and must not fail the test
                print("Warning https://github.com/WolfgangFahl/ConferenceCorpus/issues/5 dblp xml parser skips some proceedings titles#5 is not fixed yet!")
        finally:
            # close the database handle as testSqlLiteDatabaseCreation does
            sqlDB.close()

    def testUml(self):
        '''
        test generating the uml diagram for the entities
        '''
        sqlDB=self.getSqlDB(mock=TestDblp.mock,recreate=self.mock)
        try:
            uml=UML()
            now=datetime.now()
            nowYMD=now.strftime("%Y-%m-%d")
            title="""dblp.xml Entities
%s
[[https://dblp.org/ Copyright 2009-2021 dblp computer science bibliography]]
see also [[https://github.com/WolfgangFahl/dblpconf dblp conf open source project]]
""" %nowYMD
            tableList=sqlDB.getTableList()
            # table name -> display name
            schemaDefs={
                'article': 'Article',
                'book':'Book',
                'incollection': 'In Collection',
                'inproceedings': 'In Proceedings',
                'mastersthesis': 'Master Thesis',
                'phdthesis': "PhD Thesis",
                'proceedings':'Proceedings',
                'www':'Person'
            }
            baseUrl="http://wiki.bitplan.com/index.php/Dblpconf#"
            schemaManager=SchemaManager(schemaDefs=schemaDefs,baseUrl=baseUrl)
            # add the schema name and the number of rows to each table entry
            for table in tableList:
                table['schema']=table['name']
                # the table name comes from the database's own table list
                countQuery="SELECT count(*) as count from %s" % table['name']
                countResult=sqlDB.query(countQuery)
                table['instances']=countResult[0]['count']
            plantUml=uml.mergeSchema(schemaManager,tableList,title=title,packageName='dblp',generalizeTo="Record")
            # set to True to print the generated diagram source
            show=False
            if show:
                print(plantUml.replace('#/','#'))
            self.assertIn("Record <|-- article",plantUml)
            self.assertIn("class Record ",plantUml)
        finally:
            # close the database handle as testSqlLiteDatabaseCreation does
            sqlDB.close()
# run the tests when this module is executed as a script
if __name__ == "__main__":
    # to run a single test set e.g.: sys.argv = ['', 'Test.testName']
    DataSourceTest.main()