Code for Two Homologue, Poisson Distribution, Shared DNA Model, in Python

# Copyright (c) 2020 Brit Nicholson.
# All rights reserved. This program is made available under
# the terms of the Simple Non Code License (SNCL) v2.3.0,
# which is available at
# https://github.com/MysteryDash/Simple-Non-Code-License/blob/master/License.txt

import numpy as np
import random
import fnmatch
import time

def recomb(anc_1, anc_2, rand_fish, fun_spots):
    
    rand = random.random()
    if rand < 0.25:
        out_a = recomb_a_first(anc_1, rand_fish, fun_spots)
        out_b = recomb_a_first(anc_2, rand_fish, fun_spots)
    elif rand < 0.5:
        out_a = recomb_a_first(anc_1, rand_fish, fun_spots)
        out_b = recomb_b_first(anc_2, rand_fish, fun_spots)
    elif rand < 0.75:
        out_a = recomb_b_first(anc_1, rand_fish, fun_spots)
        out_b = recomb_a_first(anc_2, rand_fish, fun_spots)
    else:
        out_a = recomb_b_first(anc_1, rand_fish, fun_spots)
        out_b = recomb_b_first(anc_2, rand_fish, fun_spots)
        
    out = np.column_stack((out_a, out_b))
    return out


def recomb_a_first(anc, fun_rand_fish, fun_spots):
    
    random.shuffle(fun_spots)
    num_of_recombs = 22 + np.random.poisson(fun_rand_fish)
    cut_spots = sorted(fun_spots[:num_of_recombs])
    
    a = anc[:,0].tolist()
    b = anc[:,1].tolist()
    
    if len(cut_spots) == 0:
        recomb_out = a
        return recomb_out

    recomb_out = []
    for i in range(0, len(cut_spots)):
        if i == 0:
            recomb_out = a[0:cut_spots[0]]
        if i % 2 == 1:
            recomb_out.extend(b[cut_spots[i-1]:cut_spots[i]])
            last_a = False
        else:
            recomb_out.extend(a[cut_spots[i-1]:cut_spots[i]])
            last_a = True
    if last_a:
        recomb_out.extend(b[cut_spots[i]:len(b)])
    else:
        recomb_out.extend(a[cut_spots[i]:len(a)])
    return recomb_out


def recomb_b_first(anc, fun_rand_fish, fun_spots):
    
    random.shuffle(fun_spots)
    num_of_recombs = 22 + np.random.poisson(fun_rand_fish)
    cut_spots = sorted(fun_spots[:num_of_recombs])
    
    a = anc[:,0].tolist()
    b = anc[:,1].tolist()
    
    if len(cut_spots) == 0:
        recomb_out = b
        return recomb_out
    
    recomb_out = []
    for i in range(0, len(cut_spots)):
        if i == 0:
            recomb_out = b[0:cut_spots[0]]
        if i % 2 == 1:
            recomb_out.extend(a[cut_spots[i-1]:cut_spots[i]])
            last_a = True
        else:
            recomb_out.extend(b[cut_spots[i-1]:cut_spots[i]])
            last_a = False
    if last_a:
        recomb_out.extend(b[cut_spots[i]:len(b)])
    else:
        recomb_out.extend(a[cut_spots[i]:len(a)])
    return recomb_out

# Out of 1B Poisson values, the smallest I got was 28 and the largest was 94, so 99 (or 97+) genes should be fine.
genes_num = 99
genes_list = list(range(0, genes_num))
spots = list(range(1, genes_num))

rand_fish = 33 # λ

# Gen. = 2 simulates grandparents, 3 simulates great-grandparents, etc.
gen = 2

trials = 500000

# Only enter a "1" for one of the following to see what % of DNA one shares w/ that relative only.
aunt_bool = 0
cous_1st_bool = 0
great_aunt_bool = 0
par_of_cous_bool = 0
nth_cous_bool = 0
nth_anc_bool = 1

# Use this to change an nth relative to a half-relative
half = 0

# Create the farthest back ancestors. Label homolouges; otherwise, aunts share 50%.
for num in range(1, int((2**gen)/2) + 1):
    vars()['first_anc_' + str(num)] = np.column_stack(([str(num) + 'a' for i in genes_list], [str(num) + 'b' for i in genes_list]))
    
if nth_cous_bool == 1 or par_of_cous_bool == 1:
    fill = np.column_stack(([0 for i in genes_list], [0 for i in genes_list]))

start_time = time.time()

if aunt_bool + cous_1st_bool + great_aunt_bool + par_of_cous_bool + nth_cous_bool + nth_anc_bool != 1:
    raise ValueError('One and only one relative must be compared.')
    
if great_aunt_bool == 1 and gen != 3:
    raise ValueError('Great Aunts should only be compared when gen = 3.')

percent_same = []

for i_trials in range(0, trials):
    
    if gen > 2:
        for num in range(1, int((2**gen)/4) + 1):
            vars()['new_' + str(num)] = recomb(vars()['first_anc_' + str(2*num-1)], vars()['first_anc_' + str(2*num)], rand_fish, spots)
    
    # Recombine for close relatives. These 2 loops get it down to two grandparents.
    # Each anc. in the tree is created, but some are eventually overwritten.
    for g in range(gen-1, 2, -1):
        for num in range(1, int((2**g)/4) + 1):
            vars()['new_' + str(num)] = recomb(vars()['new_' + str(2*num-1)], vars()['new_' + str(2*num)], rand_fish, spots)
            
    # Create a 'mother'
    if gen > 2:
        mom = recomb(new_1, new_2, rand_fish, spots)
    else:
        mom = recomb(first_anc_1, first_anc_2, rand_fish, spots)

    # Create oneself (only the applicable homologue, which is arbitrarily the maternal one)
    rand = random.random()
    if rand < 0.5:
        me = recomb_a_first(mom, rand_fish, spots)
    else:
        me = recomb_b_first(mom, rand_fish, spots)
                
    me = np.asarray(me)
    
    if nth_anc_bool == 1:
        percent_same.append(np.sum((me == first_anc_2[:,0]) | (me == first_anc_2[:,1]))/(2*genes_num))
    
    elif aunt_bool == 1 or cous_1st_bool == 1:
    # Create an 'aunt'
        if gen > 2:
            aunt = recomb(new_1, new_2, rand_fish, spots)
        else:
            aunt = recomb(first_anc_1, first_anc_2, rand_fish, spots)
        
        if aunt_bool == 1:
            percent_same.append(np.sum((me == aunt[:,0]) | (me == aunt[:,1]))/(2*genes_num))
        
        else:
            rand = random.random()
            if rand < 0.5:
                cous_1st = recomb_a_first(aunt, rand_fish, spots)
            else:
                cous_1st = recomb_b_first(aunt, rand_fish, spots)
            cous_1st = np.asarray(cous_1st)
            percent_same.append(np.sum((me == cous_1st))/(2*genes_num))
            
    elif gen == 3 and great_aunt_bool == 1:
        # To keep the programming simple and limit run-time, only allow calculations for great-aunts
        # and great-uncles (siblings of one's grandparents) when gen == 3
        great_aunt = recomb(first_anc_1, first_anc_2, rand_fish, spots)
        percent_same.append(np.sum((me == great_aunt[:,0]) | (me == great_aunt[:,1]))/(2*genes_num))
    
    # Recombine for nth cousin or their parent
    elif nth_cous_bool == 1 or par_of_cous_bool == 1:
        
        # One only shares 2 close ancestors with a full nth cousin (unless they share more /:)
        if half == 0:
            new_1 = recomb(first_anc_1, first_anc_2, rand_fish, spots)
        else:
            new_1 = recomb(first_anc_1, fill, rand_fish, spots)
        
        # If gen > 3, dilute half of the genes with 'fill' for each extra generation
        for g in range(gen - 1, 2, -1):
        
            new_1 = recomb(new_1, fill, rand_fish, spots)
        
        # Create a parent of nth cousin (where n = gen - 1)
        if gen > 2:
            par_of_cous = recomb(new_1, fill, rand_fish, spots)
        else:
            par_of_cous = new_1
            #par_of_cous = recomb(new_1, fill, rand_fish, spots)
        
        if par_of_cous_bool == 1:
            percent_same.append(np.sum((me == par_of_cous[:,0]) | (me == par_of_cous[:,1]))/(2*genes_num))
        
        else: # Create an nth cousin (where n = gen - 1)
            rand = random.random()
            if rand < 0.5:
                nth_cous = recomb_a_first(par_of_cous, rand_fish, spots)
            else:
                nth_cous = recomb_b_first(par_of_cous, rand_fish, spots)
            percent_same.append(np.sum((me == nth_cous))/(2*genes_num))
    
    # Calculate the fraction of DNA shared
    #percent_same.append(np.sum((me == mom[:,0]) | (me == mom[:,1]))/(2*genes_num))

if half == 1:
    print('This is for a half-relative!', "n")
    
mean_out = np.mean(percent_same)
std_dev_out = np.std(percent_same)
print(mean_out)
print(std_dev_out)

print("n")

print("Time to complete: %s seconds." % (time.time() - start_time))

import statistics

def print_results(pv):
    print(':', str(round(100*statistics.mean(pv),2)) + '%,',
          'min:', str(round(min(pv),5)) + ',', '95%-Conf.:', str(round(np.percentile(pv, lo),4)) + '-' + str(round(np.percentile(pv, hi),4)) + ',',
          'max:', str(round(max(pv),5)) + ',', round(statistics.stdev(pv),5), 'std,',
          pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%')
    
confid = 95
lo = (100 - confid) / 2
hi = 100 - lo
    
print_results(percent_same)
Feel free to ask me about modeling & simulation, genetic genealogy, or genealogical research. To see my articles on Medium, click here. And try out a nifty calculator that’s based on the first of my three genetic models. It lets you find the amount of an ancestor’s DNA you have when combined with various relatives. And most importantly, check out these ranges of shared DNA percentages or shared centiMorgans, which are the only published values that match known standard deviations.
Code for Two Homologue, Poisson Distribution, Shared DNA Model, in Python

Related

Submit a Comment Cancel reply

Recent Posts

Recent Comments

Archives