author     Joris    2019-03-10 19:15:45 +0100
committer  Joris    2019-03-10 19:15:45 +0100
commit     8e03a571033a5d02d2287a2b1d1edd8b57aa2462 (patch)
tree       03d6965bb65322dbac03edf2d6a30ee856c197f6
parent     c41f16ed474376ad8a61a75d8b7f9ef543f359b4 (diff)
Scrape result page and make a CSV of it
-rw-r--r--  .gitignore                2
-rw-r--r--  .tmuxinator.yml          10
-rw-r--r--  LICENSE                  30
-rw-r--r--  default.nix              13
-rwxr-xr-x  dev                      18
-rw-r--r--  src/CSV.hs               17
-rw-r--r--  src/HTTP.hs              11
-rw-r--r--  src/Hiking.hs            42
-rw-r--r--  src/Main.hs              34
-rw-r--r--  src/README.md             5
-rw-r--r--  src/Scrapper.hs          75
-rw-r--r--  visorando-scraper.cabal  25
12 files changed, 282 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..697a2c6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+dist-newstyle
+.ghc.environment.*
diff --git a/.tmuxinator.yml b/.tmuxinator.yml
new file mode 100644
index 0000000..e81d3e8
--- /dev/null
+++ b/.tmuxinator.yml
@@ -0,0 +1,10 @@
+name: visorando-scraper
+
+windows:
+  - app:
+      panes:
+        - nodemon -e hs --exec 'clear && cabal new-run --verbose=0'
+        # - ghcid --command "cabal new-repl" -T ":main"
+        # - cabal new-run
+        # - ghcid --command "cabal new-repl" -T ":main"
+        # - cabal new-build
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..c6fb8b1
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,30 @@
+Copyright (c) 2019, Joris
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the following
+      disclaimer in the documentation and/or other materials provided
+      with the distribution.
+
+    * Neither the name of Joris nor the names of other
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/default.nix b/default.nix
new file mode 100644
index 0000000..2f1a1f8
--- /dev/null
+++ b/default.nix
@@ -0,0 +1,13 @@
+with import <nixpkgs> {}; {
+  env = stdenv.mkDerivation {
+    name = "env";
+    buildInputs = [
+      cabal-install
+      tmux
+      tmuxinator
+      nodePackages.nodemon
+      # haskellPackages.ghcid
+      zlib
+    ];
+  };
+}
diff --git a/dev b/dev
new file mode 100755
index 0000000..716b3af
--- /dev/null
+++ b/dev
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+cd "$(dirname $0)"
+CMD="$1"
+
+if [ "$CMD" = "start" ]; then
+
+ nix-shell --command "tmuxinator local"
+
+elif [ "$CMD" = "stop" ]; then
+
+ nix-shell --command "tmux kill-session -t visorando-scraper"
+
+else
+
+ echo "Usage: $0 start|stop"
+ exit 1
+
+fi
diff --git a/src/CSV.hs b/src/CSV.hs
new file mode 100644
index 0000000..9ad2974
--- /dev/null
+++ b/src/CSV.hs
@@ -0,0 +1,17 @@
+module CSV (lines, line) where
+
+import Data.Text (Text)
+import qualified Data.Text as T
+import Prelude hiding (lines)
+
+lines :: [[Text]] -> Text
+lines = T.unlines . map line
+
+line :: [Text] -> Text
+line = T.intercalate "," . map value
+
+value :: Text -> Text
+value text =
+  if T.isInfixOf "," text
+    then T.concat [ "\"", text, "\"" ]
+    else text
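
A note on the CSV quoting above: `value` only wraps a field in double quotes when it contains a comma, so a scraped description containing a double quote or a newline would still produce a malformed row. A fuller, RFC 4180-style escaper (a sketch, not part of this commit) could look like this:

    -- Quote a field when it contains a separator, a quote or a newline,
    -- and double any embedded quotes (RFC 4180 style).
    value :: Text -> Text
    value text =
      if T.any (\c -> c == ',' || c == '"' || c == '\n') text
        then T.concat [ "\"", T.replace "\"" "\"\"" text, "\"" ]
        else text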
diff --git a/src/HTTP.hs b/src/HTTP.hs
new file mode 100644
index 0000000..5fb3374
--- /dev/null
+++ b/src/HTTP.hs
@@ -0,0 +1,11 @@
+module HTTP (get) where
+
+import qualified Data.ByteString.Lazy as BS
+import Data.Text (Text)
+import qualified Data.Text as T
+import Data.Text.Encoding as T
+import qualified Data.Text.IO as T
+import qualified Network.HTTP.Conduit as HTTP
+
+get :: Text -> IO Text
+get url = (T.decodeUtf8 . BS.toStrict) <$> HTTP.simpleHttp (T.unpack url)
diff --git a/src/Hiking.hs b/src/Hiking.hs
new file mode 100644
index 0000000..2d04b4f
--- /dev/null
+++ b/src/Hiking.hs
@@ -0,0 +1,42 @@
+module Hiking (Hiking(..), header, values) where
+
+import Data.Text
+import qualified Data.Text as T
+
+data Hiking = Hiking
+  { name        :: Text
+  , description :: Text
+  , lat         :: Text
+  , lng         :: Text
+  , url         :: Text
+  , duration    :: Text
+  , distance    :: Text
+  , elevation   :: Text
+  , difficulty  :: Text
+  } deriving (Eq, Show)
+
+header :: [Text]
+header =
+ [ "name"
+ , "lat"
+ , "lng"
+ , "link"
+ , "durée"
+ , "distance"
+ , "dénivelé"
+ , "color"
+ , "infos"
+ ]
+
+values :: Hiking -> [Text]
+values h =
+  [ name h
+  , lat h
+  , lng h
+  , url h
+  , duration h
+  , distance h
+  , elevation h
+  , difficulty h
+  , description h
+  ]
diff --git a/src/Main.hs b/src/Main.hs
new file mode 100644
index 0000000..0d527b5
--- /dev/null
+++ b/src/Main.hs
@@ -0,0 +1,34 @@
+module Main (main) where
+
+import Data.Text (Text)
+import qualified Data.Text as T
+import qualified Data.Text.IO as T
+import qualified System.Environment as Env
+import qualified System.Exit as Exit
+
+import qualified CSV
+import Hiking (Hiking (Hiking))
+import qualified Hiking
+import qualified HTTP
+import qualified Scrapper
+
+main :: IO ()
+main = do
+  url <- getUrl
+  hikings <- getHikings url
+  T.putStrLn . CSV.lines . (:) Hiking.header . map Hiking.values $ hikings
+
+getUrl :: IO Text
+getUrl = do
+  args <- Env.getArgs
+  case args of
+    (url:_) -> return $ T.pack url
+    _       -> Exit.die "No URL provided"
+
+getHikings :: Text -> IO [Hiking]
+getHikings url = do
+  urls <- Scrapper.hikingsUrls <$> HTTP.get url
+  mapM getHiking urls
+
+getHiking :: Text -> IO Hiking
+getHiking = fmap Scrapper.hiking . HTTP.get
diff --git a/src/README.md b/src/README.md
new file mode 100644
index 0000000..03c94b4
--- /dev/null
+++ b/src/README.md
@@ -0,0 +1,5 @@
+# Visorando scraper
+
+Produces a CSV of hikes from a [search result
+page](https://www.visorando.com/randonnee-pyrenees.html) on
+[visorando](https://www.visorando.com/).
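
For reference, Main.hs reads the listing URL from its first command-line argument and writes the CSV to stdout, so a typical run (assuming the cabal new-* workflow used elsewhere in this repository) looks like `cabal new-run --verbose=0 visorando-scraper -- https://www.visorando.com/randonnee-pyrenees.html > hikes.csv`.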
diff --git a/src/Scrapper.hs b/src/Scrapper.hs
new file mode 100644
index 0000000..456a455
--- /dev/null
+++ b/src/Scrapper.hs
@@ -0,0 +1,75 @@
+module Scrapper (hikingsUrls, hiking, Hiking(..)) where
+
+import qualified Data.List as List
+import qualified Data.Maybe as Maybe
+import Data.Text (Text)
+import qualified Data.Text as T
+import Text.HTML.TagSoup (Tag (TagOpen, TagText), (~/=), (~==))
+import qualified Text.HTML.TagSoup as S
+
+import Hiking (Hiking (..))
+
+hikingsUrls :: Text -> [Text]
+hikingsUrls =
+  Maybe.catMaybes
+    . map sectionUrl
+    . S.partitions (~== ("<div class=rando-title-sansDetail>" :: String))
+    . S.parseTags
+  where
+    sectionUrl =
+      fmap (T.strip . S.fromAttrib "href")
+        . Maybe.listToMaybe
+        . dropWhile (~/= ("<a>" :: String))
+
+hiking :: Text -> Hiking
+hiking page =
+  Hiking name description lat lng url duration distance elevation difficulty
+  where
+    tags = S.parseTags page
+
+    name = textInside "<h1 itemprop=name>" "</h1>" tags
+
+    description = textInside "<p>" "</p>" . dropWhile (~/= ("<h1 itemprop=name>" :: String)) $ tags
+
+    (lat, lng) = latAndLng (info "Départ :")
+
+    url = S.fromAttrib "href" . head . dropWhile (~/= ("<link>" :: String)) $ tags
+
+    duration = info "Durée moyenne:"
+
+    distance = info "Distance :"
+
+    elevation = T.concat [ "+", info "Dénivelé positif :", " -", info "Dénivelé négatif :" ]
+
+    difficulty =
+      case info "Difficulté :" of
+        "Facile"    -> "green"
+        "Moyenne"   -> "blue"
+        "Difficile" -> "red"
+        _           -> "gray"
+
+    info title = nextTagText . drop 1 . dropWhile (not . tagTextContains title) $ tags
+
+    tagTextContains title (TagText t) = title == T.strip t
+    tagTextContains _ _ = False
+
+
+latAndLng :: Text -> (Text, Text)
+latAndLng t =
+  let [ns, lat, _, oe, lng] = T.words t
+      a = T.concat [ if ns == "S" then "-" else "", T.dropEnd 1 lat ]
+      b = T.concat [ if oe == "O" then "-" else "", T.dropEnd 1 lng ]
+  in (a, b)
+
+textInside :: Text -> Text -> [Tag Text] -> Text
+textInside from to =
+  T.unwords
+    . map T.strip
+    . T.lines
+    . T.strip
+    . S.innerText
+    . takeWhile (~/= T.unpack to)
+    . dropWhile (~/= T.unpack from)
+
+nextTagText :: [Tag Text] -> Text
+nextTagText = T.strip . S.innerText . take 1 . drop 1
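
For context, latAndLng expects the "Départ :" text to split into exactly five words (hemisphere letter, latitude, a separator, hemisphere letter, longitude), drops the trailing degree sign from each coordinate, and negates southern ("S") and western ("O") values. A rough illustration of that assumed format, written as Haskell comments since the real text on visorando pages may differ:

    -- Assumed shape of the "Départ :" field (illustrative only):
    --   latAndLng "N 42.796450° - E 0.143432°"  ==  ("42.796450", "0.143432")
    --   latAndLng "S 42.796450° - O 0.143432°"  ==  ("-42.796450", "-0.143432")
    -- Any other shape makes the pattern match in the let binding fail at runtime.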
diff --git a/visorando-scraper.cabal b/visorando-scraper.cabal
new file mode 100644
index 0000000..7dd24b6
--- /dev/null
+++ b/visorando-scraper.cabal
@@ -0,0 +1,25 @@
+name:                visorando-scraper
+version:             0.1.0.0
+license:             BSD3
+license-file:        LICENSE
+author:              Joris
+maintainer:          joris@guyonvarch.me
+category:            Data
+build-type:          Simple
+extra-source-files:  ChangeLog.md
+cabal-version:       >=1.10
+
+executable visorando-scraper
+  hs-source-dirs:      src
+  main-is:             Main.hs
+  build-depends:       base
+                     , bytestring
+                     , text
+                     , tagsoup
+                     , http-conduit
+  default-language:    Haskell2010
+  default-extensions:  OverloadedStrings
+  other-modules:       HTTP
+                     , Scrapper
+                     , Hiking
+                     , CSV