Parser Tutorial

Created by Frank Yang

Requests GET


>>> import requests
>>> req = requests.get('http://example.com')
>>> req.text
						

Requests POST


>>> payload = {'key1': 'value1', 'key2': 'value2'}
>>> req = requests.post("http://httpbin.org/post", data=payload)
>>> req.text
						

Sample HTML





						

Find One Tag


>>> from bs4 import BeautifulSoup
>>> req = requests.get('https://raw.githubusercontent.com/FrankYang0529/Parser-Tutorial/master/parse.html')
>>> soup = BeautifulSoup(req.text, 'html.parser')
>>> div = soup.find("div")
...	
>>> div.a['href']
...	'http://google.com'
						

Find All Tags


>>> div_all = soup.find_all("div", {"class": "item"})
...	[<div class="item">...</div>, <div class="item">...</div>]
>>> div_all[1].a.span.text
...	'Facebook'
						

Async Parser

Async Sample


async def coro_function():
	"""Sleep for three seconds, printing before and after.

	Two copies run concurrently below: both "sleeping" lines print
	first, then both "Done" lines, demonstrating that the coroutines
	overlap instead of running back-to-back.
	"""
	print("sleeping")
	# @asyncio.coroutine / yield-from coroutines were removed in
	# Python 3.11; async/await is the supported spelling.
	await asyncio.sleep(3)
	print("Done")

async def _main():
	# asyncio.wait() no longer accepts bare coroutine objects
	# (Python 3.11+); gather() schedules and awaits them directly.
	await asyncio.gather(coro_function(), coro_function())

asyncio.run(_main())
					

sleeping
sleeping
Done
Done
					

Async Parser


async def get(*args, **kwargs):
	"""Fetch a URL with aiohttp and return the raw response body as bytes.

	Positional/keyword arguments are forwarded to ``session.get`` (the
	callers below pass a URL and ``compress=True``).

	The legacy ``aiohttp.request('GET', ...)`` coroutine API and the
	``@asyncio.coroutine``/``yield from`` style were both removed in
	modern aiohttp and Python 3.11; a ClientSession with async/await
	is the supported form.
	"""
	async with aiohttp.ClientSession() as session:
		async with session.get(*args, **kwargs) as response:
			return await response.read()
						

def first_magnet(page):
	"""Parse *page* as HTML and return its <title> element.

	NOTE(review): despite the name, this returns the page title rather
	than a magnet link — presumably a leftover name from an earlier
	example; confirm before renaming, since callers use this name.
	"""
	parsed = bs4.BeautifulSoup(page, 'html.parser')
	return parsed.title
						

async def print_magnet(query):
	"""Download *query* (a URL) and print '<url>: <title tag>'.

	``@asyncio.coroutine``/``yield from`` were removed in Python 3.11;
	async/await is the supported spelling. Behavior is unchanged:
	fetch the page body, extract the title, print one line.
	"""
	page = await get(query, compress=True)
	magnet = first_magnet(page)
	print('{}: {}'.format(query, magnet))
						

# URLs fetched concurrently by the demo below.
distros = [
	'http://www.bbc.com/news/election-us-2016-35760148',
	'http://www.bbc.com/news/world-europe-35760985',
	'http://www.bbc.com/news/world-asia-35760797',
]

async def _fetch_all():
	# asyncio.wait() rejects bare coroutine objects on Python 3.11+;
	# gather() schedules every download concurrently and awaits them all.
	await asyncio.gather(*(print_magnet(d) for d in distros))

asyncio.run(_fetch_all())
						

Async vs. General


python3 async_parse.py
0.38s user 0.04s system 59% cpu 0.701 total
					

python3 general_parse.py
0.42s user 0.05s system 16% cpu 2.790 total