Initial Le Bon Coin scraper

Retrieve the flux state embedded in the webpage. This state contains a
JSON document holding all the information we need.
This commit is contained in:
Félix Baylac-Jacqué 2020-06-04 23:18:40 +02:00
commit 55784c07bc
Signed by: picnoir
GPG Key ID: EFD315F31848DBA4
3 changed files with 33 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
*~
result

5
default.nix Normal file
View File

@ -0,0 +1,5 @@
# Package lbc.py as an executable named "scrape", with BeautifulSoup
# made available to the script as a Python library.
let
  pkgs = import <nixpkgs> { };
  # Python libraries the script imports at runtime.
  pythonDeps = [ pkgs.python3Packages.beautifulsoup4 ];
  # The script body is read verbatim from the file next to this one.
  scriptSource = builtins.readFile ./lbc.py;
in
pkgs.writers.writePython3Bin "scrape" { libraries = pythonDeps; } scriptSource

26
lbc.py Executable file
View File

@ -0,0 +1,26 @@
from bs4 import BeautifulSoup
import sys
import json
from signal import signal, SIGPIPE, SIG_DFL
# Prefix of the inline <script> whose body assigns the page state.
_PROPS_MARKER = "window.__REDIAL_PROPS__"


class _ScrapeError(Exception):
    """The expected data could not be located in the page."""


def _find_props_script(page):
    """Return the stripped text of the script starting with the marker.

    Every <script> element of *page* is inspected; elements whose
    ``.string`` is None are skipped.

    Raises:
        _ScrapeError: unless exactly one script matches.
    """
    soup = BeautifulSoup(page, "html.parser")
    candidates = []
    for tag in soup.find_all('script'):
        if tag.string is None:
            continue
        text = tag.string.strip()
        if text.startswith(_PROPS_MARKER):
            candidates.append(text)
    if len(candidates) != 1:
        raise _ScrapeError(
            "Cannot find window.__REDIAL_PROPS__ script in which we are "
            "supposed to retrieve the data json")
    return candidates[0]


def _parse_props(script_text):
    """Decode the JSON array contained in *script_text*.

    Everything from the first '[' up to the end of the text is parsed
    as JSON.

    Raises:
        _ScrapeError: when there is no '[' or the text is not valid JSON.
    """
    start = script_text.find('[')
    if start == -1:
        # Slicing from -1 would parse only the last character and fail
        # with an unrelated message, so report the real problem instead.
        raise _ScrapeError(
            "Cannot find the json array in the "
            "window.__REDIAL_PROPS__ script")
    try:
        return json.loads(script_text[start:])
    except ValueError as err:
        raise _ScrapeError(
            f"Cannot decode the window.__REDIAL_PROPS__ json: {err}"
        ) from err


def _select_ads(state):
    """Return the ``data.ads`` value of the single entry that has one.

    Entries of *state* that are not dicts, or whose "data" value is not
    a dict containing "ads", are ignored.

    Raises:
        _ScrapeError: unless exactly one entry carries ``data.ads``.
    """
    found = [entry["data"]["ads"] for entry in state
             if isinstance(entry, dict)
             and isinstance(entry.get("data"), dict)
             and "ads" in entry["data"]]
    if len(found) != 1:
        raise _ScrapeError("Cannot find the ads section in flux state")
    return found[0]


def main():
    """Read an HTML page on stdin and print its ads as JSON on stdout.

    Exits with status 1, after printing a message on stderr, when the
    data cannot be located; exits with status 0 otherwise.
    """
    # Restore the default SIGPIPE action *before* anything is written,
    # so that a reader closing the pipe early (e.g. `scrape | head`)
    # ends the process instead of raising BrokenPipeError mid-print.
    signal(SIGPIPE, SIG_DFL)
    page = sys.stdin.read()
    try:
        ads = _select_ads(_parse_props(_find_props_script(page)))
    except _ScrapeError as err:
        # Diagnostics go to stderr to keep stdout a clean JSON stream.
        print(err, file=sys.stderr)
        sys.exit(1)
    print(json.dumps(ads))
    sys.exit(0)


if __name__ == '__main__':
    main()