Initial Le Bon Coin scraper
Retrieve the flux state embedded in the webpage. This state contains a JSON document holding all the information we need.
This commit is contained in:
commit
55784c07bc
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
|
@ -0,0 +1,2 @@
|
|||
*~
|
||||
result
|
5
default.nix
Normal file
5
default.nix
Normal file
|
@ -0,0 +1,5 @@
|
|||
# Package lbc.py as an executable named "scrape", declaring beautifulsoup4
# as the only Python library the script needs.
let
  # Same package set the original reached through `nixpkgs.pkgs`.
  inherit (import <nixpkgs> {}) pkgs;

  pythonLibraries = [ pkgs.python3Packages.beautifulsoup4 ];

  # The script text is inlined at evaluation time.
  scriptSource = builtins.readFile ./lbc.py;
in
  pkgs.writers.writePython3Bin "scrape" { libraries = pythonLibraries; } scriptSource
|
26
lbc.py
Executable file
26
lbc.py
Executable file
|
@ -0,0 +1,26 @@
|
|||
from bs4 import BeautifulSoup
|
||||
import sys
|
||||
import json
|
||||
from signal import signal, SIGPIPE, SIG_DFL
|
||||
|
||||
PROPS_PREFIX = "window.__REDIAL_PROPS__"


def find_props_script(page):
    """Return the text of the single script that sets window.__REDIAL_PROPS__.

    Args:
        page: HTML source of the page, as a string.

    Returns:
        The stripped content of the matching ``<script>`` tag.

    Raises:
        ValueError: if zero or more than one script starts with the prefix.
    """
    soup = BeautifulSoup(page, "html.parser")
    candidates = []
    for tag in soup.find_all('script'):
        # Tags whose ``.string`` is None carry no usable text; skip them.
        if tag.string is None:
            continue
        text = tag.string.strip()
        if text.startswith(PROPS_PREFIX):
            candidates.append(text)
    if len(candidates) != 1:
        raise ValueError(
            "Cannot find window.__REDIAL_PROPS__ script in which we are "
            "supposed to retrieve the data json")
    return candidates[0]


def parse_props(script):
    """Decode the JSON array assigned in the props script.

    Everything from the first ``[`` to the end of *script* is parsed as JSON.

    Args:
        script: text of the props script.

    Returns:
        The decoded JSON value.

    Raises:
        ValueError: if *script* contains no ``[`` or the remainder is not
            valid JSON.
    """
    start = script.find('[')
    if start == -1:
        # Without this guard, ``script[-1:]`` would be parsed instead.
        raise ValueError(
            "Cannot find the JSON array in window.__REDIAL_PROPS__ script")
    try:
        return json.loads(script[start:])
    except ValueError as err:  # json.JSONDecodeError is a ValueError
        raise ValueError(
            "Cannot parse the JSON in window.__REDIAL_PROPS__ script: "
            + str(err)) from err


def extract_ads(props):
    """Return the ``ads`` value of the single entry that carries one.

    Entries that are not dicts, or whose ``"data"`` value is not a dict
    containing ``"ads"``, are ignored.

    Args:
        props: iterable of decoded JSON entries.

    Returns:
        The value stored under ``entry["data"]["ads"]``.

    Raises:
        ValueError: if zero or more than one entry has an ads section.
    """
    sections = []
    for entry in props:
        if not isinstance(entry, dict):
            continue
        data = entry.get("data")
        # ``data`` may be null or a non-dict value; only dicts can hold "ads".
        if isinstance(data, dict) and "ads" in data:
            sections.append(data["ads"])
    if len(sections) != 1:
        raise ValueError("Cannot find the ads section in flux state")
    return sections[0]


def main():
    """Read an HTML page on stdin and print its ads as JSON on stdout.

    Returns:
        0 on success, 1 when the ads could not be located (the reason is
        written to stderr so stdout only ever carries JSON).
    """
    # Restore the default SIGPIPE action *before* producing output, so that
    # a consumer closing the pipe early ends the process quietly.
    signal(SIGPIPE, SIG_DFL)
    try:
        script = find_props_script(sys.stdin.read())
        ads = extract_ads(parse_props(script))
    except ValueError as err:
        print(err, file=sys.stderr)
        return 1
    print(json.dumps(ads))
    return 0


if __name__ == '__main__':
    sys.exit(main())
|
Loading…
Reference in a new issue