Link Search Menu Expand Document

Generating Fake Data

It’s often useful, especially in healthcare, to generate fake data in order to test your system while protecting the privacy of your production data sets.

## let's generate a fake person
person = {}
type(person)
dict
print(person)
{}
person['age'] = 28
type(person['age'])
int
print(person)
{'age': 28}
# let's generate a random age
import random
person['age'] = random.randrange(100)
print(person)
{'age': 24}
random.randrange?
Signature: random.randrange(start, stop=None, step=1)
Docstring:
Choose a random item from range(start, stop[, step]).

This fixes the problem with randint() which includes the
endpoint; in Python this is usually not what you want.
File:      ~/opt/miniconda3/envs/cmu39/lib/python3.9/random.py
Type:      method
person['age'] = random.randrange(100)
print(person)
{'age': 5, 'employment': 40}
# In healthcare, kids and the elderly have special coverage, so let's exclude both groups.
# Unlike randrange(), randint() includes both endpoints, so this draws an age from 18 through 65.
person['age'] = random.randint(18, 65)
print(person)
{'age': 31}
person['employment'] = 'Full Time'
print(type(person['employment']))
print(person)
<class 'str'>
{'age': 5, 'employment': 'Full Time'}
# let's set employment status
employment_status = ['Full Time', 'Part Time', 'Contract', 'Seasonal', 'Unemployed', 'Retired']
person['employment'] = random.choice(employment_status)
print(person)
{'age': 5, 'employment': 'Seasonal'}
person['employment'] = random.choice(employment_status)
print(person)
{'age': 5, 'employment': 'Full Time'}
# random.choices() takes one weight per item (same order as employment_status)
# and returns a list of picks, even when only a single item is drawn.
choice = random.choices(employment_status, [.5, .2, .1, .1, .05, .05])
print(f"Choice has type {type(choice)} and len {len(choice)}")
print(choice)

# Unwrap the single pick from the list before storing it on the person.
person['employment'] = choice[0]
print(person)
Choice has type <class 'list'> and len 1
['Full Time']
{'age': 5, 'employment': 'Full Time'}
person['employment'] = random.choices(employment_status, [.5, .2, .1, .1, .05, .05],k=1)[0]
print(person)
{'age': 5, 'employment': 'Part Time'}
def generate_employment():
    """Pick one employment status at random, weighted by how common it is.

    Returns:
        str: one of the status labels below, drawn with its paired weight.
    """
    # Label -> relative weight; dict order fixes the order passed to choices().
    weighted_statuses = {
        'Full Time': .5,
        'Part Time': .2,
        'Contract': .1,
        'Seasonal': .1,
        'Unemployed': .05,
        'Retired': .05,
    }
    # choices() returns a list of k picks; unpack the single one.
    (picked,) = random.choices(
        list(weighted_statuses),
        weights=list(weighted_statuses.values()),
        k=1,
    )
    return picked
print(generate_employment())
print(generate_employment())
print(generate_employment())
Contract
Seasonal
Seasonal
# let's construct some fake addresses
def generate_address():
    """Build a random two-line street address.

    The first line is a street number (1 through 1000) and a street name;
    the second line is "City, ST ZIP". City, state and zip are drawn
    together as one record so they always belong to the same place.

    Returns:
        str: the two address lines joined by a newline character.
    """
    streets = ['Main', 'Bluff', 'Federal']
    places = [
        {'city': 'Pittsburgh', 'zip': 15106, 'state': 'PA'},
        {'city': 'Cleveland', 'zip': 44101, 'state': 'OH'},
    ]

    house_no = random.randint(1, 1000)
    street = random.choice(streets)

    # One draw selects the whole record, keeping city/state/zip consistent.
    place = random.choice(places)
    town, state_code, postal = place['city'], place['state'], place['zip']

    return f"{house_no} {street}\n{town}, {state_code} {postal}"
address = generate_address()
print(address)
257 Federal
Pittsburgh, PA 15106
def generate_name():
    """Return a random "First Last" name drawn from two small fixed pools."""
    given_names = ['Ben', 'Jen', 'Joan', 'John']
    family_names = ['Jones', 'Smith', 'Doe']

    # Draw the first name, then the last name.
    given = random.choice(given_names)
    family = random.choice(family_names)
    return f"{given} {family}"
name = generate_name()
print(name)
Jen Doe
def generate_person():
    """Assemble one fake person record from the individual generators.

    Returns:
        dict: with keys 'name', 'address' and 'employment', each filled by
        the matching generate_* function.
    """
    # Dict literal values are evaluated top to bottom: name, address, employment.
    return {
        'name': generate_name(),
        'address': generate_address(),
        'employment': generate_employment(),
    }
person = generate_person()
print(type(person))
print(person)
<class 'dict'>
{'name': 'Ben Doe', 'address': '905 Main\nCleveland, OH 44101', 'employment': 'Contract'}
person = generate_person()
print(person)
{'name': 'Joan Smith', 'address': '617 Bluff\nPittsburgh, PA 15106', 'employment': 'Seasonal'}
person = generate_person()
print(person)
{'name': 'Jen Doe', 'address': '83 Main\nCleveland, OH 44101', 'employment': 'Retired'}