summaryrefslogtreecommitdiff
path: root/fuse-httpfs.py
diff options
context:
space:
mode:
Diffstat (limited to 'fuse-httpfs.py')
-rwxr-xr-xfuse-httpfs.py308
1 files changed, 0 insertions, 308 deletions
diff --git a/fuse-httpfs.py b/fuse-httpfs.py
deleted file mode 100755
index 3fd2d28..0000000
--- a/fuse-httpfs.py
+++ /dev/null
@@ -1,308 +0,0 @@
-#!/usr/bin/env python3
-import os
-import time
-import logging
-import logging.config
-import sys
-from urllib.parse import quote, unquote
-from email.utils import parsedate
-from html.parser import HTMLParser
-from stat import S_IFDIR, S_IFREG
-from errno import EIO, ENOENT, EBADF, EHOSTUNREACH
-
-import fuse
-import requests
-
-FORMAT = "%(threadName)s %(asctime)-15s %(levelname)s:%(name)s %(message)s"
-logging.basicConfig(level=logging.INFO, format=FORMAT)
-
-
-class Config(object):
- mountpoint = None
- timeout = (5, 25) # connect_timeout, read_timeout
-
-
-def readNetrcMachines():
- machines = []
- fh = open(os.path.expanduser("~/.netrc"), "r")
- for line in fh.readlines():
- a, b, *c = line.split(" ")
- if a == "machine" and c == []:
- machines.append(b.strip())
- logging.info("Found netrc machines: {}".format(machines))
- return machines
-
-
-class Path:
- def __init__(self, parent, name):
- self.parent = parent
- self.name = name
- self.initialized = False
-
- def buildUrl(self):
- return self.parent.buildUrl() + "/" + quote(self.name)
-
- def getSession(self):
- return self.parent.getSession()
-
- def getAttr(self):
- raise fuse.FuseOSError(ENOENT)
-
- @classmethod
- def fromPath(clazz, parent, pathElement):
- p = clazz(parent, unquote(pathElement))
- logging.info("created {} '{}' referencing {}".format(
- clazz.__name__, p.name, p.buildUrl()))
- return p
-
-
-class File(Path):
- def __init__(self, parent, name):
- super().__init__(parent, name)
- self.lastModified = None
- self.size = None
-
- def init(self):
- url = self.buildUrl()
- logging.info("File url={} name={}".format(url, self.name))
- r = self.getSession().head(url, timeout=Config.timeout)
- r.close()
- if r.status_code != 200:
- error = "Status code != 200 for {}: {}".format(url, r.status_code)
- raise Exception(error)
- self.size = int(r.headers['content-length'])
- self.lastModified = time.mktime(parsedate(r.headers['last-modified']))
-
- self.initialized = True
-
- def get(self, size, offset):
- if not self.initialized:
- self.init()
- url = self.buildUrl()
- bytesRange = '{}-{}'.format(offset, min(self.size, offset+size-1))
- headers = {'range': 'bytes=' + bytesRange}
- logging.info("File.get url={} range={}".format(url, bytesRange))
- r = self.getSession().get(url, headers=headers, timeout=Config.timeout)
- r.close()
- if r.status_code == 200 or r.status_code == 206:
- d = r.content
- logging.info("Received {} bytes".format(len(d)))
- if len(d) > size:
- errormsg = "size {} > than expected {}".format(len(d), size)
- logging.error(errormsg)
- raise fuse.FuseOSError(EIO)
- return d
- else:
- raise fuse.FuseOSError(EIO)
-
- def getAttr(self):
- if not self.initialized:
- self.init()
- t = self.lastModified
- return dict(st_mode=(S_IFREG | 0o444), st_nlink=1, st_size=self.size,
- st_ctime=t, st_mtime=t, st_atime=t)
-
-
-class Directory(Path):
- def __init__(self, parent, name):
- super().__init__(parent, name)
- self.entries = {}
-
- def init(self):
- url = self.buildUrl() + "/"
- logging.info("Directory url={} name={}".format(url, self.name))
- r = self.getSession().get(url, stream=True, timeout=Config.timeout)
- if r.status_code != 200:
- raise Exception("Status code not 200 for {}: {}".format(
- url, r.status_code))
-
- if "text/html" not in r.headers['content-type']:
- raise Exception("Is not text/html: {}".format(url))
-
- parser = RelativeLinkCollector(self)
- for line in r.iter_content(decode_unicode=True):
- parser.feed(line)
- self.entries.update(parser.entries)
- parser.entries.clear()
- parser.close()
- self.entries.update(parser.entries)
- r.close()
-
- logging.info("Diretory loaded {}".format(url))
- self.initialized = True
-
- def getAttr(self):
- t = time.time()
- nentries = 2 + len(self.entries)
- if self.initialized:
- nentries += len(self.entries)
- return dict(st_mode=(S_IFDIR | 0o555), st_nlink=nentries,
- st_ctime=t, st_mtime=t, st_atime=t)
-
-
-class Server(Directory):
- def __init__(self, parent, name):
- super().__init__(parent, name)
- self.session = requests.Session()
-
- def getSession(self):
- return self.session
-
-
-class Schema(Directory):
- def __init__(self, parent, name):
- super().__init__(parent, name)
- self.initialized = True
-
- def buildUrl(self):
- return self.name + ":/"
-
-
-class Root(Directory):
- def __init__(self):
- super().__init__(None, "")
- self.initialized = True
-
- def buildUrl(self):
- return ""
-
-
-class RelativeLinkCollector(HTMLParser):
- def __init__(self, parent):
- super().__init__(self)
- self.parent = parent
- self.entries = {}
-
- def handle_starttag(self, tag, attrs):
- if tag == "a":
- attrs = dict(attrs)
- if "href" in attrs:
- href = attrs["href"]
- if "/" in href[:-1] or href[0] == ".":
- return
-
- if href[-1:] == "/":
- d = Directory.fromPath(self.parent, href[:-1])
- self.entries[unquote(href[:-1])] = d
- else:
- f = File.fromPath(self.parent, href)
- self.entries[unquote(href)] = f
-
-
-class HttpFs(fuse.LoggingMixIn, fuse.Operations):
- """A read only http/https/ftp filesystem using python-requests."""
- def __init__(self):
- self.root = Root()
-
- https = Schema(self.root, 'https')
- https.entries = dict(self._getDefaultEntries(https))
- http = Schema(self.root, 'http')
- http.entries = dict(self._getDefaultEntries(http))
-
- self.root.entries = {'http': http, 'https': https}
-
- def _getDefaultEntries(self, parent):
- for machine in readNetrcMachines():
- yield (machine, Server(parent, machine))
-
- def getattr(self, path, fh=None):
- logging.info("getattr path={}".format(path))
- try:
- entry = self._getPath(path)
- if entry:
- return entry.getAttr()
- except Exception as e:
- logging.error("Error", e)
- raise fuse.FuseOSError(EHOSTUNREACH)
- raise fuse.FuseOSError(ENOENT)
-
- def _getPath(self, path):
- """ map path to self.root tree
- a path is build like /<schema>/<server hostname>/<http-path>"""
- logging.debug("getPath path={}".format(path))
- if path == "/":
- return self.root
-
- schema, *p = path[1:].split("/")
- if schema not in self.root.entries:
- return None
- prevEntry = self.root.entries[schema]
- if p == []:
- return prevEntry
-
- server, *p = p
- if server not in prevEntry.entries:
- # create server if not exists
- prevEntry.entries[server] = Server.fromPath(prevEntry, server)
- prevEntry = prevEntry.entries[server]
- if p == []:
- return prevEntry
-
- *pathElements, lastElement = p
- for pathElement in pathElements:
- if pathElement not in prevEntry.entries:
- d = Directory.fromPath(prevEntry, pathElement)
- prevEntry.entries[pathElement] = d
- prevEntry = prevEntry.entries[pathElement]
-
- if lastElement not in prevEntry.entries:
- if not prevEntry.initialized:
- prevEntry.init()
- if lastElement not in prevEntry.entries:
- # the server don't return it, then just create it
- # assuming its an directory, if a HEAD is successful
- d = Directory.fromPath(prevEntry, lastElement)
- r = d.getSession().head(d.buildUrl() + "/",
- timeout=Config.timeout)
- if r.status_code == 200:
- logging.info("Create directory for path which was not " +
- "discovered by Index of: {}".format(path))
- prevEntry.entries[lastElement] = d
- else:
- logging.info("Path not found: {}".format(path))
- return None
- return prevEntry.entries[lastElement]
-
- def readdir(self, path, fh):
- entry = self._getPath(path)
- if not entry:
- raise fuse.FuseOSError(EBADF)
- if not entry.initialized:
- entry.init()
- return [(".", entry.getAttr(), 0),
- ("..", (entry.parent and entry.parent.getAttr() or None), 0)] \
- + [(it.name, it.getAttr(), 0) for it in entry.entries.values()]
-
- def read(self, path, size, offset, fh):
- entry = self._getPath(path)
- if isinstance(entry, File):
- return entry.get(size, offset)
- else:
- raise fuse.FuseOSError(EIO)
-
-if __name__ == '__main__':
- import argparse
- p = argparse.ArgumentParser()
- p.add_argument("mountpoint", nargs=1, help="Target directory")
- p.add_argument("--max_background", type=int, default=15,
- help="Maximum number of background threads")
- p.add_argument("--no_foreground", action="store_true", default=False,
- help="Fork into background as a daemon")
- p.add_argument("--debug", action="store_true", help="Stay foreground")
- p.add_argument("--nothreads", action="store_true", help="Stay foreground")
- p.add_argument("--connect_timeout", type=int,
- default=Config.timeout[0], help="HTTP connect timeout")
- p.add_argument("--read_timeout", type=int,
- default=Config.timeout[1], help="HTTP read timeout")
-
- args = vars(p.parse_args(sys.argv[1:]))
- kwargs = {}
- Config.timeout = (args.pop("connect_timeout"), args.pop("read_timeout"))
- Config.mountpoint = args.pop("mountpoint")[0]
- if not args.pop("no_foreground"):
- kwargs["foreground"] = True
- if args.pop("debug"):
- kwargs["debug"] = True
- kwargs.update(args)
-
- fuse = fuse.FUSE(HttpFs(), Config.mountpoint, **kwargs)