commit 55784c07bccb080e9e204339d2704d977407093c
Author: Félix Baylac-Jacqué
Date:   Thu Jun 4 23:18:40 2020 +0200

    Initial le bon coin scraper

    Retrieving the flux state in the webpage. This state contains a JSON
    file containing all the information we need.

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b629ca6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+*~
+result
\ No newline at end of file
diff --git a/default.nix b/default.nix
new file mode 100644
index 0000000..723c5d3
--- /dev/null
+++ b/default.nix
@@ -0,0 +1,5 @@
+let
+  nixpkgs = import <nixpkgs> {};
+  pkgs = nixpkgs.pkgs;
+in pkgs.writers.writePython3Bin "scrape" { libraries = [ pkgs.python3Packages.beautifulsoup4 ]; }
+  (builtins.readFile ./lbc.py)
diff --git a/lbc.py b/lbc.py
new file mode 100755
index 0000000..e456e33
--- /dev/null
+++ b/lbc.py
@@ -0,0 +1,65 @@
+from bs4 import BeautifulSoup
+import sys
+import json
+from signal import signal, SIGPIPE, SIG_DFL
+
+PROPS_PREFIX = "window.__REDIAL_PROPS__"
+
+
+def fail(message):
+    """Print message on stderr and exit with status 1."""
+    print(message, file=sys.stderr)
+    sys.exit(1)
+
+
+def find_props_script(page):
+    """Return the text of the only window.__REDIAL_PROPS__ script.
+
+    Exits with status 1 unless exactly one <script> starts with it.
+    """
+    soup = BeautifulSoup(page, "html.parser")
+    scripts = [s.string.strip() for s in soup.find_all('script')
+               if s.string is not None
+               and s.string.strip().startswith(PROPS_PREFIX)]
+    if len(scripts) != 1:
+        fail("Cannot find window.__REDIAL_PROPS__ script in which we are "
+             "supposed to retrieve the data json")
+    return scripts[0]
+
+
+def parse_props(script):
+    """Decode the JSON array that starts at the first '[' of script."""
+    start = script.find('[')
+    if start == -1:
+        # find() returns -1 when there is no '['; slicing from -1 would
+        # hand json.loads the last character only.
+        fail("Cannot find the json array in window.__REDIAL_PROPS__")
+    try:
+        return json.loads(script[start:])
+    except ValueError as err:
+        fail("Cannot decode window.__REDIAL_PROPS__ json: {}".format(err))
+
+
+def extract_ads(state):
+    """Return the "ads" value of the only entry carrying data.ads."""
+    ads = [obj["data"]["ads"] for obj in state
+           if isinstance(obj, dict)
+           and isinstance(obj.get("data"), dict)
+           and "ads" in obj["data"]]
+    if len(ads) != 1:
+        fail("Cannot find the ads section in flux state")
+    return ads[0]
+
+
+def main():
+    # Restore the default SIGPIPE action before anything is written, so
+    # that a reader closing the pipe early (e.g. `| head`) ends the
+    # script quietly instead of raising BrokenPipeError.
+    signal(SIGPIPE, SIG_DFL)
+    page = sys.stdin.read()
+    print(json.dumps(extract_ads(parse_props(find_props_script(page)))))
+
+
+if __name__ == '__main__':
+    main()
+    sys.exit(0)