# Copyright (c) 2020 Brit Nicholson.
# All rights reserved. This program is made available under
# the terms of the Simple Non Code License (SNCL) v2.3.0,
# which is available at
# https://github.com/MysteryDash/Simple-Non-Code-License/blob/master/License.txt

# Training model for determining the average number of "segments" passed from a simulated
# parent to a child based on the known standard deviation between siblings.

import numpy as np
import random
import math
import pandas as pd

# The below function allows a "child" to "inherit" 50% of each of his/her parents' DNA
# The halved # of segments that a parent has available is passed as the 1st arg.
# The labeled "DNA segments" of the first parent are passed as the 2nd arg.
# An optional argument is the labeled segments of the 2nd parent. This is always used here.
def recomb(fun_half_segs, first_anc, second_anc=None):

  # Half of the 1st parent's segments are passed
  out = random.sample(first_anc, fun_half_segs)

  if second_anc != None:
    # Half of the 2nd parent's segments are passed (added to the list)
    out.extend(random.sample(second_anc, fun_half_segs))
  return out

# Initialize the number of segments
segs = 96.95

# Assuming std_dev is just under 0.036.
# Doesn't really matter if it's + or - at first. It will correct itself.
difference = -0.0001 

segs_out = []
mean_out = []
std_dev_out = []

# The simulation appears to keep running with the value: 0.000001.
# With one less zero it would sometimes exit the while loop
# when the number of segments was around 97.5
while abs(difference) > 0.000001:

  if difference > 0:
    # If std_dev is high, the # of segs. needs to be increased
    # Got too much variability with 0.001, even w/ 200,401 sib_pairs
    segs += 0.05
  else:
    # If std_dev is low, the # of segs. needs to be decreased
    segs -= 0.05

  percent_same = []

  # This is how many trials will be done to compair a pair of siblings
  sib_pairs = 200401
  for i in range(0, sib_pairs):

    # Only the decimal portion remaining after halving the segments.
    # This allows #s other than even whole integer #s of segments.
    seg_decimal = segs/2 - int(segs/2)

    if np.random.random() > seg_decimal:
      # If the decimal is 0.75, round down the halved # of segments 25% of the time.
      half_segs = math.floor(segs/2)
    else:
      # If the decimal is 0.75, round up the halved # of segments 75% of the time.
      half_segs = math.ceil(segs/2)

    # Make labeled "segments" for maternal and paternal ancestors (parents here):
    m = list(range(0, half_segs*2))
    p = list(m)

    m = ['m' + str(i) for i in m]
    p = ['p' + str(i) for i in p]
  
    # Call the function for the first sibling
    s1 = recomb(half_segs, m, p)
  
    # Call the function for the second sibling
    s2 = recomb(half_segs, m, p)
  
    # Take the unique intersection (shared segments) of the siblings
    in_common = list(set(s1) & set(s2))
  
    # Calculate the fraction of segments per sibling that is shared
    percent_same.append(len(in_common)/(2*half_segs))

  # Find the standard deviation of all of the shared fractions
  std_dev = np.std(percent_same)
    
  # Find if the model overshot or undershot, for future correction
  difference = std_dev - 0.036

  segs_out.append(segs)
  mean_out.append(np.mean(percent_same))
  std_dev_out.append(std_dev)
  print(segs)
  print(std_dev)
  #print(min(percent_same))
  #print(max(percent_same))

Feel free to ask me about modeling & simulation, genetic genealogy, or genealogical research. To see my articles on Medium, click here. And try out a nifty calculator that’s based on the first of my three genetic models. It lets you find the amount of an ancestor’s DNA you have when combined with various relatives. And most importantly, check out these ranges of shared DNA percentages or shared centiMorgans, which are the only published values that match known standard deviations.