In [1]:
import sqlite3
In [29]:
# Connect/Create Database
try:
    conn = sqlite3.connect('test.sqlite3')
except:
    raise

Create a Test Database

This process can be very slow for large test sets. You can expedite reducing faker calls or stacking cursor executions prior to commit, committing in larger batches. For small sets (n<10000), the following script should be reasonably quick on a modern system.

In [31]:
# Initialize the Database

create_statement = '''CREATE TABLE Students
(StudentID TEXT PRIMARY KEY NOT NULL,
FirstName CHAR(50),
LastName CHAR(50),
Age INT,
Status CHAR(10),
GPA FLOAT
);'''

try:
    conn.execute(create_statement)
except:
    print('Could Not Create Table')
    raise
In [32]:
# Generate some fake data
import numpy as np
from faker import Faker
import hashlib

fake = Faker()
m = hashlib.sha256()
In [131]:
# List of possible statuses
statuses = np.array(['enrolled', 'good_standing', 'excellent_standing', 'suspended', 'expelled', 'probation'])

# Generate a lot of students
students = []
for _ in range(10000):
    m.update(str(np.random.randint(2,1000000)).encode('ASCII'))
    students.append(
        {
        'StudentID': m.hexdigest(),
        'FirstName': fake.name().split(' ')[0],
        'LastName': fake.name().split(' ')[1],
        'Age': np.random.choice([None, np.random.randint(5,19)]),
        'GPA': np.random.choice([None, round(np.random.rand()*5, 2)]),
        'Status': np.random.choice(statuses),
    }
    )
In [132]:
# Build insert queries and write to database
cursor = conn.cursor()
for student in students:
    cursor.execute('INSERT INTO Students VALUES (:StudentID, :FirstName, :LastName, :Age, :Status, :GPA);', student)
    conn.commit()
    
cursor.close()
In [133]:
# Check the data
cursor = conn.cursor()
rows = cursor.execute('SELECT * FROM Students LIMIT 5').fetchall()
for row in rows:
    print(row)
('b46c2986250a0494233bae0f2b62e2575deb3dc31589517bc378e4a8e89e970b', 'Justin', 'Chase', 11, 'enrolled', 0.24)
('9944ccf32cccfa3173dd89825d06c8ea2547b01ea81d81c83014cbe527324301', 'Matthew', 'Moore', 10, 'enrolled', 4.3)
('4652b92d1e035963b559fe4efb13a74499d00f86524acab8f23f0f66e62176f4', 'Kenneth', 'Sanchez', 10, 'enrolled', 3.72)
('de2d48b54a7dc2d211115c91302577359299f23716226d210c19f9173b3af16c', 'Steve', 'Howard', 15, 'excellent_standing', 2.23)
('85760b45dac4cfbf1bc762d37a49b4e59edc4e9a3663201e31b9d5b63ba74f09', 'Michael', 'Lewis', 12, 'good_standing', 1.57)

Mixing Group By, Where, and Having

In [2]:
%%time
# Here is our rewrite of the MODE search.  SQLite3 does not have the MODE function, so some trickery is required.
#    To mimic the MODE function present in Oracle, PostgreSQL, and other modern RDBMS, a view is prepared 
#    for separate logic in GroupBy.

query_mode = """
WITH STATUS_VIEW AS
(SELECT Age, Status, COUNT(Status) AS Count FROM Students GROUP BY Status, Age ORDER BY Age DESC)

SELECT Students.Age, AVG(Students.GPA), s.Status FROM Students 
LEFT JOIN
(
SELECT sv.Age, sv.Status FROM STATUS_VIEW as sv GROUP BY sv.Age HAVING sv.Count = MAX(sv.Count)
) AS s
ON Students.Age=s.Age
WHERE Students.Age NOT NULL
GROUP BY Students.Age
"""

cursor = conn.cursor()
rows = cursor.execute(query_mode).fetchall()
for row in rows:
    print(row)
cursor.close()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<timed exec> in <module>()

NameError: name 'conn' is not defined
In [171]:
%%time
# We can modify the logic to search for other helpful parameters, like the inverse mode.
#  This functionality is not provided even in more modern RDBMS systems and must be written manually.
#  Luckily, introducing custom aggregation functions with GROUP BY and HAVING is not difficult.
#  NOTE: Oracle and other RDBMS to offer support for user-defined aggregate functions.

query_inverse_mode = """
WITH STATUS_VIEW AS
(SELECT Age, Status, COUNT(Status) AS Count FROM Students GROUP BY Status, Age ORDER BY Age DESC)

SELECT Students.Age, AVG(Students.GPA), s.Status FROM Students 
LEFT JOIN
(
SELECT sv.Age, sv.Status FROM STATUS_VIEW as sv GROUP BY sv.Age HAVING sv.Count = MIN(sv.Count)
) AS s
ON Students.Age=s.Age
WHERE Students.Age NOT NULL
GROUP BY Students.Age
"""

cursor = conn.cursor()
rows = cursor.execute(query_inverse_mode).fetchall()
for row in rows:
    print(row)
cursor.close()
(5, 2.50382258064516, 'excellent_standing')
(6, 2.5088500819568713, 'good_standing')
(7, 2.5292647427854495, 'probation')
(8, 2.4910812835560017, 'suspended')
(9, 2.5048250344914043, 'expelled')
(10, 2.4822273008029665, 'enrolled')
(11, 2.5182845498332735, 'suspended')
(12, 2.4934253326761864, 'expelled')
(13, 2.497141269449245, 'suspended')
(14, 2.521988826815644, 'excellent_standing')
(15, 2.5179627766599624, 'expelled')
(16, 2.485683087231667, 'excellent_standing')
(17, 2.510615708622402, 'excellent_standing')
(18, 2.491481388958282, 'expelled')
Wall time: 342 ms
In [173]:
# Close the connection
conn.close()
In [ ]: