import re
regex = re.compile(
r'^(?:http|ftp)s?://' # http:// or https://
r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain...
r'localhost|' #localhost...
r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
r'(?::\d+)?' # optional port
r'(?:/?|[/?]\S+)$', re.IGNORECASE)
print(re.match(regex, "http://www.example.com") is not None) # True
print(re.match(regex, "example.com") is not None) # False
from django.core.validators import URLValidator
from django.core.exceptions import ValidationError
val = URLValidator(verify_exists=False)
try:
val('http://www.google.com')
except ValidationError, e:
print e
If you set verify_exists to True, it will actually verify that the URL exists, otherwise it will just check if it's formed correctly.
I landed on this page trying to figure out a sane way to validate strings as "valid" urls. I share here my solution using python3. No extra libraries required.
import urllib
from pprint import pprint
invalid_url = 'dkakasdkjdjakdjadjfalskdjfalk'
valid_url = 'https://stackoverflow.com'
tokens = [urllib.parse.urlparse(url) for url in (invalid_url, valid_url)]
for token in tokens:
pprint(token)
min_attributes = ('scheme', 'netloc') # add attrs to your liking
for token in tokens:
if not all([getattr(token, attr) for attr in min_attributes]):
error = "'{url}' string has no scheme or netloc.".format(url=token.geturl())
print(error)
else:
print("'{url}' is probably a valid url.".format(url=token.geturl()))
As pointed out by @Kwame , the below code does validate the url even if the .com or .co etc are not present.
also pointed out by @Blaise, URLs like https://www.google is a valid URL
and you need to do a DNS check for checking if it resolves or not, separately.
This is simple and works:
So min_attr contains the basic set of strings that needs to be present to define the validity of a URL,
i.e http:// part and google.com part.
urlparse.scheme stores http:// and
urlparse.netloc store the domain name google.com
from urlparse import urlparse
def url_check(url):
min_attr = ('scheme' , 'netloc')
try:
result = urlparse(url)
if all([result.scheme, result.netloc]):
return True
else:
return False
except:
return False
all() returns true if all the variables inside it return true.
So if result.scheme and result.netloc is present i.e. has some value then the URL is valid and hence returns True.
The Django URL validation regex was actually pretty good but I needed to tweak it a little bit for my use case. Feel free to adapt it to yours!
Python 3.7
import re
import urllib
# Check https://regex101.com/r/A326u1/5 for reference
DOMAIN_FORMAT = re.compile(
r"(?:^(\w{1,255}):(.{1,255})@|^)" # http basic authentication [optional]
r"(?:(?:(?=\S{0,253}(?:$|:))" # check full domain length to be less than or equal to 253 (starting after http basic auth, stopping before port)
r"((?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+" # check for at least one subdomain (maximum length per subdomain: 63 characters), dashes in between allowed
r"(?:[a-z0-9]{1,63})))" # check for top level domain, no dashes allowed
r"|localhost)" # accept also "localhost" only
r"(:\d{1,5})?", # port [optional]
re.IGNORECASE
)
SCHEME_FORMAT = re.compile(
r"^(http|hxxp|ftp|fxp)s?$", # scheme: http(s) or ftp(s)
re.IGNORECASE
)
def validate_url(url: str):
url = url.strip()
if not url:
raise Exception("No URL specified")
if len(url) > 2048:
raise Exception("URL exceeds its maximum length of 2048 characters (given length={})".format(len(url)))
result = urllib.parse.urlparse(url)
scheme = result.scheme
domain = result.netloc
if not scheme:
raise Exception("No URL scheme specified")
if not re.fullmatch(SCHEME_FORMAT, scheme):
raise Exception("URL scheme must either be http(s) or ftp(s) (given scheme={})".format(scheme))
if not domain:
raise Exception("No URL domain specified")
if not re.fullmatch(DOMAIN_FORMAT, domain):
raise Exception("URL domain malformed (domain={})".format(domain))
return url
Explanation
The code only validates the scheme and netloc part of a given URL. (To do this properly, I split the URL with urllib.parse.urlparse() in the two according parts which are then matched with the corresponding regex terms.)
The netloc part stops before the first occurrence of a slash /, so port numbers are still part of the netloc, e.g.:
https://www.google.com:80/search?q=python
^^^^^ ^^^^^^^^^^^^^^^^^
| |
| +-- netloc (aka "domain" in my code)
+-- scheme
IPv4 addresses are also validated
IPv6 Support
If you want the URL validator to also work with IPv6 addresses, do the following:
Add is_valid_ipv6(ip) from Markus Jarderot's answer, which has a really good IPv6 validator regex
Add and not is_valid_ipv6(domain) to the last if
Examples
Here are some examples of the regex for the netloc (aka domain) part in action:
Not directly relevant, but often it's required to identify whether some token CAN be a url or not, not necessarily 100% correctly formed (ie, https part omitted and so on). I've read this post and did not find the solution, so I am posting my own here for the sake of completeness.
def get_domain_suffixes():
import requests
res=requests.get('https://publicsuffix.org/list/public_suffix_list.dat')
lst=set()
for line in res.text.split('\n'):
if not line.startswith('//'):
domains=line.split('.')
cand=domains[-1]
if cand:
lst.add('.'+cand)
return tuple(sorted(lst))
domain_suffixes=get_domain_suffixes()
def reminds_url(txt:str):
"""
>>> reminds_url('yandex.ru.com/somepath')
True
"""
ltext=txt.lower().split('/')[0]
return ltext.startswith(('http','www','ftp')) or ltext.endswith(domain_suffixes)