MijnAfvalWijzerScraper

From Projects - ronaldteune.nl
Revision as of 22:02, 28 January 2016 by Ronald (Talk | contribs)

(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to: navigation, search

MijnAfvalWijzer scraper

Replace POSTAL_CODE and NUMBER in the URL in the script below with your own postal code and house number.

Updated for the changes in layout at mijnafvalwijzer.nl of around Jan 26th, 2015.

For some reason, plastic is found under a different XPath than GFT (organic waste), paper and even Christmas trees.

Create an empty SQLite database file named afval.db containing a table 'afval' with the columns 'date' and 'type'.

from lxml import html
import httplib2, re, datetime
from datetime import date
from pysqlite2 import dbapi2 as sqlite

# Download the collection calendar page. POSTAL_CODE and NUMBER in the URL are
# placeholders that must be replaced before running. The 5 second timeout
# keeps the script from hanging when the site does not respond.
http = httplib2.Http(timeout=5)
resp, content = http.request("http://www.mijnafvalwijzer.nl/nl/POSTAL_CODE/NUMBER/")

# Fail early with a clear message instead of parsing an error page and
# tripping over missing elements later on.
if resp.status != 200:
    raise IOError("mijnafvalwijzer.nl returned HTTP status %s" % resp.status)

tree = html.fromstring(content)

# Collect the <p> elements that describe the pickups. The expression is tied
# to the site's page layout; two paths are unioned because not every entry
# sits at the same place in the document.
items = tree.xpath('/html/body/div/div[5]/section/div[2]/div//a/p | /html/body/div/div[5]/section/div[2]/div//p')

# Matches a leading run of lowercase letters plus the whitespace character
# that follows it (presumably the weekday name in front of the day number --
# verify against the page). Written as a raw string so that "\s" reaches the
# regex engine untouched and does not trigger an invalid-escape warning.
p = re.compile(r'^[a-z]*?\s')

# Sanity check: show the class and text of the first entry so a layout change
# on the site is noticed right away. Abort with a readable message when the
# XPath matched nothing, rather than failing with a bare IndexError.
if not items:
    raise SystemExit("No pickup entries found; the page layout may have changed.")

# Single-argument print calls behave the same on Python 2 and Python 3.
print(items[0].attrib.get('class'))
print(items[0].text)

# Dutch month names as they are looked up in the scraped text, in calendar
# order, paired with their month number.
DUTCH_MONTHS = (
    ('januari', 1),
    ('februari', 2),
    ('maart', 3),
    ('april', 4),
    ('mei', 5),
    ('juni', 6),
    ('juli', 7),
    ('augustus', 8),
    ('september', 9),
    ('oktober', 10),
    ('november', 11),
    ('december', 12),
)

# The scraped text carries no year, so it is fixed here. Update when needed.
YEAR = 2016

con = sqlite.connect("afval.db", detect_types=sqlite.PARSE_COLNAMES)
cur = con.cursor()

try:
    for item in items:
        # The waste type is taken from the element's class attribute.
        aftype = item.attrib['class']
        if aftype == 'kerstbomen':
            # Christmas tree pickups are not stored.
            continue

        # An element without direct text yields None; treat it as empty so it
        # is reported and skipped below instead of crashing the regex call.
        text = item.text or ''

        # Strip the leading word matched by `p`, leaving "<day> <month>".
        dag = p.sub('', text)

        for name, number in DUTCH_MONTHS:
            if name in dag:
                # int() tolerates surrounding whitespace left after removal.
                dag2 = datetime.date(YEAR, number, int(dag.replace(' ' + name, '')))
                break
        else:
            # No month name recognised. Skip the entry rather than inserting
            # the date left over from the previous iteration.
            print("Skipping entry with unrecognised date text: %r" % (text,))
            continue

        cur.execute("insert into afval(date,type) values(?, ?)", (dag2, aftype))

    # Only commit when every entry was processed without error.
    con.commit()
finally:
    # Release the cursor and connection even when scraping or inserting fails.
    cur.close()
    con.close()