Comparison with SQL¶
Since many potential pandas users have some familiarity with SQL, this page is meant to provide some examples of how various SQL operations would be performed using pandas.
If you’re new to pandas, you might want to first read through 10 Minutes to pandas to familiarize yourself with the library.
As is customary, we import pandas and NumPy as follows:
In [1]: import pandas as pd
In [2]: import numpy as np
Most of the examples will utilize the tips
dataset found within pandas tests. We’ll read
the data into a DataFrame called tips and assume we have a database table of the same name and
structure.
In [3]: url = 'https://raw.github.com/pandas-dev/pandas/master/pandas/tests/data/tips.csv'
In [4]: tips = pd.read_csv(url)
---------------------------------------------------------------------------
ConnectionRefusedError Traceback (most recent call last)
/usr/lib/python3.7/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
1323 h.request(req.get_method(), req.selector, req.data, headers,
-> 1324 encode_chunked=req.has_header('Transfer-encoding'))
1325 except OSError as err: # timeout error
/usr/lib/python3.7/http/client.py in request(self, method, url, body, headers, encode_chunked)
1243 """Send a complete request to the server."""
-> 1244 self._send_request(method, url, body, headers, encode_chunked)
1245
/usr/lib/python3.7/http/client.py in _send_request(self, method, url, body, headers, encode_chunked)
1289 body = _encode(body, 'body')
-> 1290 self.endheaders(body, encode_chunked=encode_chunked)
1291
/usr/lib/python3.7/http/client.py in endheaders(self, message_body, encode_chunked)
1238 raise CannotSendHeader()
-> 1239 self._send_output(message_body, encode_chunked=encode_chunked)
1240
/usr/lib/python3.7/http/client.py in _send_output(self, message_body, encode_chunked)
1025 del self._buffer[:]
-> 1026 self.send(msg)
1027
/usr/lib/python3.7/http/client.py in send(self, data)
965 if self.auto_open:
--> 966 self.connect()
967 else:
/usr/lib/python3.7/http/client.py in connect(self)
1398
-> 1399 super().connect()
1400
/usr/lib/python3.7/http/client.py in connect(self)
937 self.sock = self._create_connection(
--> 938 (self.host,self.port), self.timeout, self.source_address)
939 self.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
/usr/lib/python3.7/socket.py in create_connection(address, timeout, source_address)
726 if err is not None:
--> 727 raise err
728 else:
/usr/lib/python3.7/socket.py in create_connection(address, timeout, source_address)
715 sock.bind(source_address)
--> 716 sock.connect(sa)
717 # Break explicitly a reference cycle
ConnectionRefusedError: [Errno 111] Connection refused
During handling of the above exception, another exception occurred:
URLError Traceback (most recent call last)
<ipython-input-4-8ab2297b7141> in <module>()
----> 1 tips = pd.read_csv(url)
/opt/build/pandas-0.23.3+dfsg/debian/python3-pandas/usr/lib/python3/dist-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, doublequote, delim_whitespace, low_memory, memory_map, float_precision)
676 skip_blank_lines=skip_blank_lines)
677
--> 678 return _read(filepath_or_buffer, kwds)
679
680 parser_f.__name__ = name
/opt/build/pandas-0.23.3+dfsg/debian/python3-pandas/usr/lib/python3/dist-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
422 compression = _infer_compression(filepath_or_buffer, compression)
423 filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
--> 424 filepath_or_buffer, encoding, compression)
425 kwds['compression'] = compression
426
/opt/build/pandas-0.23.3+dfsg/debian/python3-pandas/usr/lib/python3/dist-packages/pandas/io/common.py in get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode)
193
194 if _is_url(filepath_or_buffer):
--> 195 req = _urlopen(filepath_or_buffer)
196 content_encoding = req.headers.get('Content-Encoding', None)
197 if content_encoding == 'gzip':
/usr/lib/python3.7/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
/usr/lib/python3.7/urllib/request.py in open(self, fullurl, data, timeout)
523 req = meth(req)
524
--> 525 response = self._open(req, data)
526
527 # post-process response
/usr/lib/python3.7/urllib/request.py in _open(self, req, data)
541 protocol = req.type
542 result = self._call_chain(self.handle_open, protocol, protocol +
--> 543 '_open', req)
544 if result:
545 return result
/usr/lib/python3.7/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
501 for handler in handlers:
502 func = getattr(handler, meth_name)
--> 503 result = func(*args)
504 if result is not None:
505 return result
/usr/lib/python3.7/urllib/request.py in https_open(self, req)
1365 def https_open(self, req):
1366 return self.do_open(http.client.HTTPSConnection, req,
-> 1367 context=self._context, check_hostname=self._check_hostname)
1368
1369 https_request = AbstractHTTPHandler.do_request_
/usr/lib/python3.7/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
1324 encode_chunked=req.has_header('Transfer-encoding'))
1325 except OSError as err: # timeout error
-> 1326 raise URLError(err)
1327 r = h.getresponse()
1328 except:
URLError: <urlopen error [Errno 111] Connection refused>
In [5]: tips.head()