Home Installation Walkthrough Pipeline modules Pipeline configuration Plotting tools Community guidelines

FAOM API documentation

foam.maximum_likelihood_estimator

Functions to perform different kinds of maximum likelihood estimation for the models in a grid, and make correlation plots. Note: The file with observations needs to hold temperature as Teff, although the analysis is done using the logTeff values.

  1""" Functions to perform different kinds of maximum likelihood estimation for the models in a grid, and make correlation plots.
  2Note: The file with observations needs to hold temperature as Teff, although the analysis is done using the logTeff values."""
  3
  4import logging
  5import os
  6import sys
  7from pathlib import Path
  8
  9import matplotlib.pyplot as plt
 10import numpy as np
 11import pandas as pd
 12
 13from foam import build_optimised_pattern as bop
 14from foam import support_functions as sf
 15
 16# Make a child logger of "logger" made in the top level script
 17logger = logging.getLogger("logger.mle_estimator")
 18
 19
 20# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 21def calculate_likelihood(
 22    theory_file,
 23    observables=None,
 24    merit_function=None,
 25    obs_path=None,
 26    star_name=None,
 27    fixed_params=None,
 28    grid_parameters=None,
 29):
 30    """
 31    Perform a maximum likelihood estimation using the provided type of merit function on the list of observables.
 32    Writes a data file with the values of the merit function and input parameters of each model.
 33    Can select and continue the analysis of nested grids through the keyword 'fixed_params'.
 34
 35    Parameters
 36    ----------
 37    obs_path: string
 38        Path to the tsv file with observations, with a column for each observable and each set of errors.
 39        Column names specify the observable, and "_err" suffix denotes that it's the error.
 40    theory_file: string
 41        Path to the hdf5 file with the theoretical model input parameters (first set of columns), frequency or period values (last set of columns),
 42        and possibly extra columns with additional observables (these columns should be in between the input parameters and frequency columns).
 43    observables: list of strings
 44        Which observables are included in the merit function.
 45        Must contain either 'f' (frequency), 'P' (period), or 'dP' (period-spacing) which will be computed for the period pattern.
 46        Can contain any additional observables that are added as columns in both the file with observations and the file with theoretical models.
 47    merit_function: string
 48        The type of merit function to use. Currently supports "CS and "MD" ("chi-squared" and "mahalanobis distance").
 49    star_name: string
 50        Name of the star, used in file naming.
 51    fixed_params: dict
 52        Only select and analyse the part of the theoretical grid with the specified parameter values.
 53        The keys specify for which parameters only the specified value should be selected.
 54    grid_parameters: list of string
 55        List of the parameters in the theoretical grid.
 56    """
 57    if "f" in observables:
 58        observed_quantity = "frequency"
 59    elif "P" or "dP" in observables:
 60        observed_quantity = "period"
 61    # Read in the observed data and make an array of the observed observables
 62    obs_dataframe = pd.read_table(obs_path, sep="\s+", header=0, index_col="index")
 63    obs, obs_err, file_suffix_observables = create_obs_observables_array(obs_dataframe, observables)
 64
 65    # set the name of the output file
 66    _, tail = sf.split_line(Path(theory_file).stem, star_name)
 67    filename = f"{star_name}{tail}_{merit_function}_{file_suffix_observables}"
 68
 69    # Theoretical grid data
 70    theory_dataframe = sf.get_subgrid_dataframe(theory_file, fixed_params)
 71
 72    # get the interruptions in the pattern, absolute index in dataframe
 73    missing_absolute = np.where(theory_dataframe.columns.to_series().str.contains(f"{observed_quantity}_missing"))[0]
 74    # get the interruptions in the pattern, index relative within pulsations
 75    missing_relative = np.where(
 76        theory_dataframe.filter(like=f"{observed_quantity}")
 77        .columns.to_series()
 78        .str.contains(f"{observed_quantity}_missing")
 79    )[0]
 80    # Remove columns of missing frequencies
 81    theory_dataframe = theory_dataframe.drop(columns=theory_dataframe.columns[missing_absolute])
 82    # Adjust indices for removed lines of missing frequencies
 83    missing_indices = [missing_relative[i] - i for i in range(len(missing_relative))]
 84
 85    thetas = np.asarray(theory_dataframe.filter(["rot"] + ["rot_err"] + grid_parameters))
 86    theory_puls = np.asarray(theory_dataframe.filter(like=f"{observed_quantity}"))
 87
 88    # Make new list of theoretical models without entries with value -1
 89    new_theory = []
 90    new_thetas = []
 91    for i in range(len(theory_puls)):
 92        # ignore models where one of the freqs is -1
 93        if (theory_puls[i][0] != -1) and (theory_puls[i][-1] != -1):
 94            # make an array of the theoretical observables for each model
 95            theory_observables = create_theory_observables_array(theory_dataframe, i, observables, missing_indices)
 96
 97            new_theory.append(theory_observables)
 98            new_thetas.append(thetas[i])
 99    neg_value = np.unique(np.where(theory_puls == -1)[0])
100    logger.debug(f"File: {theory_file}")
101    logger.debug(f"observables      : {observables}")
102    logger.debug(f"ignored -1 freqs : {len(neg_value)}")
103    logger.debug(f"original #models : {len(theory_puls)}")
104    logger.debug(f"remaining #models: {len(new_theory)}")
105    if len(neg_value) > 0:
106        logger.warning(
107            f"""{len(neg_value)} models were discarded due to mismatches during the selection of theoretical frequencies.
108                        This is likely due to the frequency range used for the theoretical calculations being too narrow."""
109        )
110    new_theory = np.asarray(new_theory)
111    new_thetas = np.asarray(new_thetas)
112
113    # Dictionary containing different merit functions
114    switcher = {"CS": merit_chi2, "MD": merit_mahalanobis}
115
116    # get the desired function from the dictionary. Returns the lambda function if option is not in the dictionary.
117    selected_merit_function = switcher.get(
118        merit_function,
119        lambda x, y, z: sys.exit(logger.error("invalid type of maximum likelihood estimator")),
120    )
121    merit_values = selected_merit_function(obs, obs_err, new_theory, fig_title=f"{filename}", star_name=star_name)
122
123    # Combine values and save the results
124    # add an additional column for MLE 'meritValues'
125    combined_data = np.concatenate((np.matrix(merit_values).T, new_thetas), axis=1)
126    # put the data in a pandas DataFrame
127    df = pd.DataFrame(data=combined_data, columns=["meritValue"] + ["rot"] + ["rot_err"] + grid_parameters)
128    df = pd.merge(
129        df,
130        theory_dataframe.drop(list(theory_dataframe.filter(regex=f"{observed_quantity}").columns), axis=1),
131        how="inner",
132        on=["rot", "rot_err"] + grid_parameters,
133    )
134    df.to_hdf(path_or_buf=f"{os.getcwd()}/meritvalues/{filename}.hdf", key="merit_values", format="table", mode="w")
135
136
137# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
138# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
139def create_theory_observables_array(theory_dataframe, index, observables_in, missing_indices):
140    """
141    Create an array of theoretical observables.
142
143    Parameters
144    ----------
145    theory_dataframe: pandas dataFrame
146        DataFrame containing the theoretical periods or frequencies (as the last columns), along with any additional
147        columns containing extra observables.
148    index: int
149        Row index in the dataFrame of the theoretical model to make the array for.
150    observables_in: list of strings
151        Which observables are included in the returned array.
152        Must contain either 'f' (frequency), 'P' (period), or 'dP' (period-spacing) which will be computed for the period pattern.
153        Can contain any additional observables that are added as columns in both the file with observations and the file with theoretical models.
154    missing_indices: list of int
155        Contains the indices of the missing pulsations so that the period spacing pattern can be split around them.
156
157    Returns
158    ----------
159    observables_out: numpy array, dtype=float
160        The values of the specified observables for the model.
161    """
162    # Make a copy to leave the array handed to this function unaltered.
163    observables = list(observables_in)
164    if "P" in observables:
165        # Cast to list before using asarray to prevent memory leak
166        # add the periods to the output list
167        observables_out = np.asarray(list(theory_dataframe.filter(like="period").loc[index]))
168        observables.remove("P")
169
170    elif "f" in observables:
171        # Cast to list before using asarray to prevent memory leak
172        # add the frequencies to the output list
173        observables_out = np.asarray(list(theory_dataframe.filter(like="frequency").loc[index]))
174        observables.remove("f")
175
176    elif "dP" in observables:
177        periods = np.asarray(theory_dataframe.filter(like="period").loc[index])
178        observables_out = []
179        for periods_part in np.split(periods, missing_indices):
180            spacing, _ = bop.generate_spacing_series(periods_part)
181            # switch back from seconds to days (so both P and dP are in days)
182            observables_out = np.append(observables_out, np.asarray(spacing) / 86400)
183        observables.remove("dP")
184
185    # Add all other observables in the list from the dataFrame
186    for observable in observables:
187        observables_out = np.append(observables_out, np.asarray(theory_dataframe.loc[index, observable]))
188
189    return observables_out
190
191
192# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
193def create_obs_observables_array(obs_dataframe, observables):
194    """
195    Create an array of the observed observables.
196
197    Parameters
198    ----------
199    obs_dataframe: pandas dataFrame
200        DataFrame containing the theoretical frequencies, periods, and any additional observables as columns, as well as columns with their errors.
201        Column names specify the observable, and "_err" suffix denotes that it's the error.
202    observables: list of strings
203        Which observables are included in the returned array.
204        Must contain either 'f' (frequency), 'P' (period), or 'dP' (period-spacing) which will be computed for the period pattern.
205        Can contain any additional observables that are added as columns in both the file with observations and the file with theoretical models.
206
207    Returns
208    ----------
209    observables_out: numpy array, dtype=float
210        The values of the specified observables.
211    observables_err_out: numpy array, dtype=float
212        The errors on the values in observables_out.
213    filename_suffix: string
214        suffix for the filename, containing all the included observables, separated by '-'
215    """
216    # get the interruptions in the pattern
217    missing_indices = np.where(obs_dataframe.index.isin(["f_missing"]))[0]
218    # Adjust indices for removed lines of missing frequencies
219    missing_indices = [missing_indices[i] - i for i in range(len(missing_indices))]
220    # remove lines indicating missing frequencies (if they are present)
221    if len(missing_indices) != 0:
222        obs_dataframe = obs_dataframe.drop(index="f_missing")
223
224    observables = list(observables)  # make a copy of the list, to not alter the one that was given to the function
225    filename_suffix = ""
226
227    if "P" in observables:
228        observables_out = np.asarray(obs_dataframe["period"])
229        observables_err_out = np.asarray(obs_dataframe["period_err"])
230        filename_suffix = "P"
231        observables.remove("P")
232
233    elif "f" in observables:
234        observables_out = np.asarray(obs_dataframe["frequency"])
235        observables_err_out = np.asarray(obs_dataframe["frequency_err"])
236        filename_suffix = "f"
237        observables.remove("f")
238
239    elif "dP" in observables:
240        observables_out = []
241        observables_err_out = []
242        period = np.asarray(obs_dataframe["period"])
243        period_err = np.asarray(obs_dataframe["period_err"])
244        periods_parts = np.split(period, missing_indices)
245        periods_err_parts = np.split(period_err, missing_indices)
246        for periods, periods_err in zip(periods_parts, periods_err_parts):
247            spacing, spacing_errs = bop.generate_spacing_series(periods, periods_err)
248            # switch back from seconds to days (so both P and dP are in days)
249            observables_out = np.append(observables_out, np.asarray(spacing) / 86400)
250            observables_err_out = np.append(observables_err_out, np.asarray(spacing_errs) / 86400)
251
252        filename_suffix = "dP"
253        observables.remove("dP")
254
255    if len(observables) > 0:
256        filename_suffix += "+extra"
257    # Add all other observables in the list from the dataFrame
258    for observable in observables:
259        logTeff = False
260        if observable == "logTeff":
261            # To read it as Teff from the observations datafile
262            observable = "Teff"
263            logTeff = True
264        # since these columns have less entries, the dataFrame has NaNs in the empty rows
265        obs = np.asarray(obs_dataframe[observable])
266        # remove all entries that are NaN in the numpy array
267        obs = obs[~np.isnan(obs)]
268        obs_err = np.asarray(obs_dataframe[f"{observable}_err"])
269        obs_err = obs_err[~np.isnan(obs_err)]
270
271        # Convert observed Teff and error to log values
272        if logTeff:
273            logT = np.log10(obs)
274            logT_err = obs_err / (obs * np.log(10))
275            observables_out = np.append(observables_out, logT)
276            observables_err_out = np.append(observables_err_out, logT_err)
277        else:
278            observables_out = np.append(observables_out, obs)
279            observables_err_out = np.append(observables_err_out, obs_err)
280
281    return observables_out, observables_err_out, filename_suffix
282
283
284# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
285# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
286def merit_chi2(Yobs, obs_err, YTheo, fig_title=None, star_name=None):
287    """
288    Calculate chi squared values for the given theoretical patterns
289
290    Parameters
291    ----------
292    Yobs: numpy array, dtype=float
293        observed values (period or frequency)
294    obs_err: numpy array, dtype=float
295        Errors on Yobs
296    YTheo: numpy ndarray (2D), dtype=float
297        Array of all theoretical patterns to calculate the chi squared value for.
298    fig_title: None
299        Should not be used in this function, but is to make it analogous to merit_mahalanobis()
300        and enable the use of the lambda function.
301    star_name: None
302        Should not be used in this function, but is to make it analogous to merit_mahalanobis()
303        and enable the use of the lambda function.
304    Returns
305    ----------
306    chi2: numpy array, dtype=float
307        chi squared values for the given theoretical values
308    """
309    chi2 = np.array([np.sum(((one_YTheo - Yobs) / obs_err) ** 2) for one_YTheo in YTheo])
310    return chi2
311
312
313# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
314def merit_mahalanobis(Yobs, obs_err, YTheo, generate_output=True, fig_title=None, star_name=None):
315    """
316    Calculate mahalanobis distance (MD) values for the given theoretical patterns.
317
318    Parameters
319    ----------
320    Yobs: numpy array, dtype=float
321        observed values (period or frequency)
322    obs_err: numpy array, dtype=float
323        Errors on Yobs
324    YTheo: numpy array of arrays of floats
325        Array of all theoretical patterns to calculate the MD value for.
326    generate_output: boolean
327        Flag to write output and plot the variance-covariance matrix
328    fig_title: string
329        The name of the figure to be created.
330    star_name: string
331        The name of the analysed star, for file naming purposes.
332
333    Returns
334    ----------
335    MD: numpy array, dtype=float
336        Mahalanobis distances for the given theoretical patterns.
337    """
338    # Convert to matrix format (np.matrix is not recommended, use array and newaxis instead)
339    YobsMat = np.array(Yobs)[np.newaxis].T
340    YTheoMat = np.array(YTheo)[np.newaxis].T
341
342    # Calculate the average on the theoretical values (e.g. frequencies)
343    # over the entire grid. Returns a vector of dimension N x 1
344    Yav = YTheoMat.mean(1)
345
346    # Calculate the variance-covariance matrix
347    # number of grid points
348    q = np.shape(YTheo)[0]
349    # number of observed values
350    N = len(Yobs)
351    v_matrix = np.zeros((N, N))
352    for i in range(q):
353        difference = np.subtract(YTheoMat[:, i], Yav)
354        v_matrix += np.matmul(difference, difference.T)
355    v_matrix = v_matrix / float(q - 1)
356
357    # Include observational errors in the variance-covariance matrix
358    v_matrix = v_matrix + np.diag(obs_err**2.0)
359    # check if positive definite and make figure
360    check_matrix(v_matrix, generate_output=generate_output, fig_title=fig_title, star_name=star_name)
361    # Calculate Mahalanobis distances
362    MD = np.zeros(q)
363    v_matrix_inv = np.linalg.inv(v_matrix)
364    for i in range(q):
365        diff = YTheoMat[:, i] - YobsMat
366        MD[i] = np.matmul(np.matmul(diff.T, v_matrix_inv), diff)[0][0]
367
368    return MD
369
370
371################################################################################
372def check_matrix(v_matrix, generate_output=True, fig_title="Vmatrix", star_name=None):
373    """
374    Check the if the the eigenvalues of the Variance-covariance matrix are all positive,
375    since this means the matrix is positive definite. Compute its determinant and condition number,
376    and write them to a tsv file. Create and save a figure of the variance-covariance matrix.
377
378    Parameters
379    ----------
380    v_matrix: numpy ndarray (2D), dtype=float
381        Variance-covariance matrix
382    output: boolean
383        Flag to write output and plot the variance-covariance matrix
384    fig_title: string
385        The name of the figure to be created.
386    star_name: string
387        The name of the analysed star, for file naming purposes.
388    """
389    # If all eigenvalues are >0, it is positive definite
390    if np.all(np.linalg.eigvals(v_matrix) > 0) == False:
391        logger.error("V matrix is possibly not positive definite (since eigenvalues are not all > 0)")
392        sys.exit(1)
393    logger.debug(f"max(v_matrix) = {np.max(v_matrix)}")
394    # multiply the matrix by the exponent of this, otherwise the determinant can be too small for the numerics
395    kk = 10
396
397    if generate_output is True:
398        file_path = Path(f"{os.getcwd()}/V_matrix/{star_name}_determinant_conditionNr.tsv")
399        file_path.parent.mkdir(parents=True, exist_ok=True)
400        if not file_path.is_file():
401            with file_path.open("w") as file:
402                file.write(f"method \t ln(det(V)) \t condition_number \n")
403        with file_path.open("a") as file:
404            file.write(
405                f"{fig_title} \t {np.log(np.linalg.det(np.exp(kk)*v_matrix))-kk*v_matrix.shape[0]:.2f} \t {np.linalg.cond(v_matrix):.2f} \n "
406            )
407
408        # Do *10^4 to get rid of small values, and put this in the colorbar label
409        im = plt.imshow(v_matrix * 10**4, aspect="auto", cmap="Reds")
410        plt.ylabel(rf"obs {v_matrix.shape[0]}  $\leftarrow$ obs 1", size=14)
411        plt.xlabel(rf"obs 1 $\rightarrow$ obs {v_matrix.shape[0]} ", size=14)
412
413        cbar = plt.colorbar(im)
414        cbar.ax.set_ylabel(r"[d$^{2} 10^{-4}$]", rotation=90, labelpad=15, size=14)
415        cbar.ax.tick_params(labelsize=14)
416        plt.title("Variance covariance matrix")
417        plt.tick_params(labelsize=14)
418        plt.tight_layout()
419        plt.savefig(f"{os.getcwd()}/V_matrix/{fig_title}.png")
420        plt.clf()
421        plt.close("all")
def calculate_likelihood( theory_file, observables=None, merit_function=None, obs_path=None, star_name=None, fixed_params=None, grid_parameters=None):
 22def calculate_likelihood(
 23    theory_file,
 24    observables=None,
 25    merit_function=None,
 26    obs_path=None,
 27    star_name=None,
 28    fixed_params=None,
 29    grid_parameters=None,
 30):
 31    """
 32    Perform a maximum likelihood estimation using the provided type of merit function on the list of observables.
 33    Writes a data file with the values of the merit function and input parameters of each model.
 34    Can select and continue the analysis of nested grids through the keyword 'fixed_params'.
 35
 36    Parameters
 37    ----------
 38    obs_path: string
 39        Path to the tsv file with observations, with a column for each observable and each set of errors.
 40        Column names specify the observable, and "_err" suffix denotes that it's the error.
 41    theory_file: string
 42        Path to the hdf5 file with the theoretical model input parameters (first set of columns), frequency or period values (last set of columns),
 43        and possibly extra columns with additional observables (these columns should be in between the input parameters and frequency columns).
 44    observables: list of strings
 45        Which observables are included in the merit function.
 46        Must contain either 'f' (frequency), 'P' (period), or 'dP' (period-spacing) which will be computed for the period pattern.
 47        Can contain any additional observables that are added as columns in both the file with observations and the file with theoretical models.
 48    merit_function: string
 49        The type of merit function to use. Currently supports "CS and "MD" ("chi-squared" and "mahalanobis distance").
 50    star_name: string
 51        Name of the star, used in file naming.
 52    fixed_params: dict
 53        Only select and analyse the part of the theoretical grid with the specified parameter values.
 54        The keys specify for which parameters only the specified value should be selected.
 55    grid_parameters: list of string
 56        List of the parameters in the theoretical grid.
 57    """
 58    if "f" in observables:
 59        observed_quantity = "frequency"
 60    elif "P" or "dP" in observables:
 61        observed_quantity = "period"
 62    # Read in the observed data and make an array of the observed observables
 63    obs_dataframe = pd.read_table(obs_path, sep="\s+", header=0, index_col="index")
 64    obs, obs_err, file_suffix_observables = create_obs_observables_array(obs_dataframe, observables)
 65
 66    # set the name of the output file
 67    _, tail = sf.split_line(Path(theory_file).stem, star_name)
 68    filename = f"{star_name}{tail}_{merit_function}_{file_suffix_observables}"
 69
 70    # Theoretical grid data
 71    theory_dataframe = sf.get_subgrid_dataframe(theory_file, fixed_params)
 72
 73    # get the interruptions in the pattern, absolute index in dataframe
 74    missing_absolute = np.where(theory_dataframe.columns.to_series().str.contains(f"{observed_quantity}_missing"))[0]
 75    # get the interruptions in the pattern, index relative within pulsations
 76    missing_relative = np.where(
 77        theory_dataframe.filter(like=f"{observed_quantity}")
 78        .columns.to_series()
 79        .str.contains(f"{observed_quantity}_missing")
 80    )[0]
 81    # Remove columns of missing frequencies
 82    theory_dataframe = theory_dataframe.drop(columns=theory_dataframe.columns[missing_absolute])
 83    # Adjust indices for removed lines of missing frequencies
 84    missing_indices = [missing_relative[i] - i for i in range(len(missing_relative))]
 85
 86    thetas = np.asarray(theory_dataframe.filter(["rot"] + ["rot_err"] + grid_parameters))
 87    theory_puls = np.asarray(theory_dataframe.filter(like=f"{observed_quantity}"))
 88
 89    # Make new list of theoretical models without entries with value -1
 90    new_theory = []
 91    new_thetas = []
 92    for i in range(len(theory_puls)):
 93        # ignore models where one of the freqs is -1
 94        if (theory_puls[i][0] != -1) and (theory_puls[i][-1] != -1):
 95            # make an array of the theoretical observables for each model
 96            theory_observables = create_theory_observables_array(theory_dataframe, i, observables, missing_indices)
 97
 98            new_theory.append(theory_observables)
 99            new_thetas.append(thetas[i])
100    neg_value = np.unique(np.where(theory_puls == -1)[0])
101    logger.debug(f"File: {theory_file}")
102    logger.debug(f"observables      : {observables}")
103    logger.debug(f"ignored -1 freqs : {len(neg_value)}")
104    logger.debug(f"original #models : {len(theory_puls)}")
105    logger.debug(f"remaining #models: {len(new_theory)}")
106    if len(neg_value) > 0:
107        logger.warning(
108            f"""{len(neg_value)} models were discarded due to mismatches during the selection of theoretical frequencies.
109                        This is likely due to the frequency range used for the theoretical calculations being too narrow."""
110        )
111    new_theory = np.asarray(new_theory)
112    new_thetas = np.asarray(new_thetas)
113
114    # Dictionary containing different merit functions
115    switcher = {"CS": merit_chi2, "MD": merit_mahalanobis}
116
117    # get the desired function from the dictionary. Returns the lambda function if option is not in the dictionary.
118    selected_merit_function = switcher.get(
119        merit_function,
120        lambda x, y, z: sys.exit(logger.error("invalid type of maximum likelihood estimator")),
121    )
122    merit_values = selected_merit_function(obs, obs_err, new_theory, fig_title=f"{filename}", star_name=star_name)
123
124    # Combine values and save the results
125    # add an additional column for MLE 'meritValues'
126    combined_data = np.concatenate((np.matrix(merit_values).T, new_thetas), axis=1)
127    # put the data in a pandas DataFrame
128    df = pd.DataFrame(data=combined_data, columns=["meritValue"] + ["rot"] + ["rot_err"] + grid_parameters)
129    df = pd.merge(
130        df,
131        theory_dataframe.drop(list(theory_dataframe.filter(regex=f"{observed_quantity}").columns), axis=1),
132        how="inner",
133        on=["rot", "rot_err"] + grid_parameters,
134    )
135    df.to_hdf(path_or_buf=f"{os.getcwd()}/meritvalues/{filename}.hdf", key="merit_values", format="table", mode="w")

Perform a maximum likelihood estimation using the provided type of merit function on the list of observables. Writes a data file with the values of the merit function and input parameters of each model. Can select and continue the analysis of nested grids through the keyword 'fixed_params'.

Parameters
  • obs_path (string): Path to the tsv file with observations, with a column for each observable and each set of errors. Column names specify the observable, and "_err" suffix denotes that it's the error.
  • theory_file (string): Path to the hdf5 file with the theoretical model input parameters (first set of columns), frequency or period values (last set of columns), and possibly extra columns with additional observables (these columns should be in between the input parameters and frequency columns).
  • observables (list of strings): Which observables are included in the merit function. Must contain either 'f' (frequency), 'P' (period), or 'dP' (period-spacing) which will be computed for the period pattern. Can contain any additional observables that are added as columns in both the file with observations and the file with theoretical models.
  • merit_function (string): The type of merit function to use. Currently supports "CS and "MD" ("chi-squared" and "mahalanobis distance").
  • star_name (string): Name of the star, used in file naming.
  • fixed_params (dict): Only select and analyse the part of the theoretical grid with the specified parameter values. The keys specify for which parameters only the specified value should be selected.
  • grid_parameters (list of string): List of the parameters in the theoretical grid.
def create_theory_observables_array(theory_dataframe, index, observables_in, missing_indices):
140def create_theory_observables_array(theory_dataframe, index, observables_in, missing_indices):
141    """
142    Create an array of theoretical observables.
143
144    Parameters
145    ----------
146    theory_dataframe: pandas dataFrame
147        DataFrame containing the theoretical periods or frequencies (as the last columns), along with any additional
148        columns containing extra observables.
149    index: int
150        Row index in the dataFrame of the theoretical model to make the array for.
151    observables_in: list of strings
152        Which observables are included in the returned array.
153        Must contain either 'f' (frequency), 'P' (period), or 'dP' (period-spacing) which will be computed for the period pattern.
154        Can contain any additional observables that are added as columns in both the file with observations and the file with theoretical models.
155    missing_indices: list of int
156        Contains the indices of the missing pulsations so that the period spacing pattern can be split around them.
157
158    Returns
159    ----------
160    observables_out: numpy array, dtype=float
161        The values of the specified observables for the model.
162    """
163    # Make a copy to leave the array handed to this function unaltered.
164    observables = list(observables_in)
165    if "P" in observables:
166        # Cast to list before using asarray to prevent memory leak
167        # add the periods to the output list
168        observables_out = np.asarray(list(theory_dataframe.filter(like="period").loc[index]))
169        observables.remove("P")
170
171    elif "f" in observables:
172        # Cast to list before using asarray to prevent memory leak
173        # add the frequencies to the output list
174        observables_out = np.asarray(list(theory_dataframe.filter(like="frequency").loc[index]))
175        observables.remove("f")
176
177    elif "dP" in observables:
178        periods = np.asarray(theory_dataframe.filter(like="period").loc[index])
179        observables_out = []
180        for periods_part in np.split(periods, missing_indices):
181            spacing, _ = bop.generate_spacing_series(periods_part)
182            # switch back from seconds to days (so both P and dP are in days)
183            observables_out = np.append(observables_out, np.asarray(spacing) / 86400)
184        observables.remove("dP")
185
186    # Add all other observables in the list from the dataFrame
187    for observable in observables:
188        observables_out = np.append(observables_out, np.asarray(theory_dataframe.loc[index, observable]))
189
190    return observables_out

Create an array of theoretical observables.

Parameters
  • theory_dataframe (pandas dataFrame): DataFrame containing the theoretical periods or frequencies (as the last columns), along with any additional columns containing extra observables.
  • index (int): Row index in the dataFrame of the theoretical model to make the array for.
  • observables_in (list of strings): Which observables are included in the returned array. Must contain either 'f' (frequency), 'P' (period), or 'dP' (period-spacing) which will be computed for the period pattern. Can contain any additional observables that are added as columns in both the file with observations and the file with theoretical models.
  • missing_indices (list of int): Contains the indices of the missing pulsations so that the period spacing pattern can be split around them.
Returns
  • observables_out (numpy array, dtype=float): The values of the specified observables for the model.
def create_obs_observables_array(obs_dataframe, observables):
194def create_obs_observables_array(obs_dataframe, observables):
195    """
196    Create an array of the observed observables.
197
198    Parameters
199    ----------
200    obs_dataframe: pandas dataFrame
201        DataFrame containing the theoretical frequencies, periods, and any additional observables as columns, as well as columns with their errors.
202        Column names specify the observable, and "_err" suffix denotes that it's the error.
203    observables: list of strings
204        Which observables are included in the returned array.
205        Must contain either 'f' (frequency), 'P' (period), or 'dP' (period-spacing) which will be computed for the period pattern.
206        Can contain any additional observables that are added as columns in both the file with observations and the file with theoretical models.
207
208    Returns
209    ----------
210    observables_out: numpy array, dtype=float
211        The values of the specified observables.
212    observables_err_out: numpy array, dtype=float
213        The errors on the values in observables_out.
214    filename_suffix: string
215        suffix for the filename, containing all the included observables, separated by '-'
216    """
217    # get the interruptions in the pattern
218    missing_indices = np.where(obs_dataframe.index.isin(["f_missing"]))[0]
219    # Adjust indices for removed lines of missing frequencies
220    missing_indices = [missing_indices[i] - i for i in range(len(missing_indices))]
221    # remove lines indicating missing frequencies (if they are present)
222    if len(missing_indices) != 0:
223        obs_dataframe = obs_dataframe.drop(index="f_missing")
224
225    observables = list(observables)  # make a copy of the list, to not alter the one that was given to the function
226    filename_suffix = ""
227
228    if "P" in observables:
229        observables_out = np.asarray(obs_dataframe["period"])
230        observables_err_out = np.asarray(obs_dataframe["period_err"])
231        filename_suffix = "P"
232        observables.remove("P")
233
234    elif "f" in observables:
235        observables_out = np.asarray(obs_dataframe["frequency"])
236        observables_err_out = np.asarray(obs_dataframe["frequency_err"])
237        filename_suffix = "f"
238        observables.remove("f")
239
240    elif "dP" in observables:
241        observables_out = []
242        observables_err_out = []
243        period = np.asarray(obs_dataframe["period"])
244        period_err = np.asarray(obs_dataframe["period_err"])
245        periods_parts = np.split(period, missing_indices)
246        periods_err_parts = np.split(period_err, missing_indices)
247        for periods, periods_err in zip(periods_parts, periods_err_parts):
248            spacing, spacing_errs = bop.generate_spacing_series(periods, periods_err)
249            # switch back from seconds to days (so both P and dP are in days)
250            observables_out = np.append(observables_out, np.asarray(spacing) / 86400)
251            observables_err_out = np.append(observables_err_out, np.asarray(spacing_errs) / 86400)
252
253        filename_suffix = "dP"
254        observables.remove("dP")
255
256    if len(observables) > 0:
257        filename_suffix += "+extra"
258    # Add all other observables in the list from the dataFrame
259    for observable in observables:
260        logTeff = False
261        if observable == "logTeff":
262            # To read it as Teff from the observations datafile
263            observable = "Teff"
264            logTeff = True
265        # since these columns have less entries, the dataFrame has NaNs in the empty rows
266        obs = np.asarray(obs_dataframe[observable])
267        # remove all entries that are NaN in the numpy array
268        obs = obs[~np.isnan(obs)]
269        obs_err = np.asarray(obs_dataframe[f"{observable}_err"])
270        obs_err = obs_err[~np.isnan(obs_err)]
271
272        # Convert observed Teff and error to log values
273        if logTeff:
274            logT = np.log10(obs)
275            logT_err = obs_err / (obs * np.log(10))
276            observables_out = np.append(observables_out, logT)
277            observables_err_out = np.append(observables_err_out, logT_err)
278        else:
279            observables_out = np.append(observables_out, obs)
280            observables_err_out = np.append(observables_err_out, obs_err)
281
282    return observables_out, observables_err_out, filename_suffix

Create an array of the observed observables.

Parameters
  • obs_dataframe (pandas dataFrame): DataFrame containing the theoretical frequencies, periods, and any additional observables as columns, as well as columns with their errors. Column names specify the observable, and "_err" suffix denotes that it's the error.
  • observables (list of strings): Which observables are included in the returned array. Must contain either 'f' (frequency), 'P' (period), or 'dP' (period-spacing) which will be computed for the period pattern. Can contain any additional observables that are added as columns in both the file with observations and the file with theoretical models.
Returns
  • observables_out (numpy array, dtype=float): The values of the specified observables.
  • observables_err_out (numpy array, dtype=float): The errors on the values in observables_out.
  • filename_suffix (string): suffix for the filename, containing all the included observables, separated by '-'
def merit_chi2(Yobs, obs_err, YTheo, fig_title=None, star_name=None):
287def merit_chi2(Yobs, obs_err, YTheo, fig_title=None, star_name=None):
288    """
289    Calculate chi squared values for the given theoretical patterns
290
291    Parameters
292    ----------
293    Yobs: numpy array, dtype=float
294        observed values (period or frequency)
295    obs_err: numpy array, dtype=float
296        Errors on Yobs
297    YTheo: numpy ndarray (2D), dtype=float
298        Array of all theoretical patterns to calculate the chi squared value for.
299    fig_title: None
300        Should not be used in this function, but is to make it analogous to merit_mahalanobis()
301        and enable the use of the lambda function.
302    star_name: None
303        Should not be used in this function, but is to make it analogous to merit_mahalanobis()
304        and enable the use of the lambda function.
305    Returns
306    ----------
307    chi2: numpy array, dtype=float
308        chi squared values for the given theoretical values
309    """
310    chi2 = np.array([np.sum(((one_YTheo - Yobs) / obs_err) ** 2) for one_YTheo in YTheo])
311    return chi2

Calculate chi squared values for the given theoretical patterns

Parameters
  • Yobs (numpy array, dtype=float): observed values (period or frequency)
  • obs_err (numpy array, dtype=float): Errors on Yobs
  • YTheo (numpy ndarray (2D), dtype=float): Array of all theoretical patterns to calculate the chi squared value for.
  • fig_title (None): Should not be used in this function, but is to make it analogous to merit_mahalanobis() and enable the use of the lambda function.
  • star_name (None): Should not be used in this function, but is to make it analogous to merit_mahalanobis() and enable the use of the lambda function.
Returns
  • chi2 (numpy array, dtype=float): chi squared values for the given theoretical values
def merit_mahalanobis( Yobs, obs_err, YTheo, generate_output=True, fig_title=None, star_name=None):
315def merit_mahalanobis(Yobs, obs_err, YTheo, generate_output=True, fig_title=None, star_name=None):
316    """
317    Calculate mahalanobis distance (MD) values for the given theoretical patterns.
318
319    Parameters
320    ----------
321    Yobs: numpy array, dtype=float
322        observed values (period or frequency)
323    obs_err: numpy array, dtype=float
324        Errors on Yobs
325    YTheo: numpy array of arrays of floats
326        Array of all theoretical patterns to calculate the MD value for.
327    generate_output: boolean
328        Flag to write output and plot the variance-covariance matrix
329    fig_title: string
330        The name of the figure to be created.
331    star_name: string
332        The name of the analysed star, for file naming purposes.
333
334    Returns
335    ----------
336    MD: numpy array, dtype=float
337        Mahalanobis distances for the given theoretical patterns.
338    """
339    # Convert to matrix format (np.matrix is not recommended, use array and newaxis instead)
340    YobsMat = np.array(Yobs)[np.newaxis].T
341    YTheoMat = np.array(YTheo)[np.newaxis].T
342
343    # Calculate the average on the theoretical values (e.g. frequencies)
344    # over the entire grid. Returns a vector of dimension N x 1
345    Yav = YTheoMat.mean(1)
346
347    # Calculate the variance-covariance matrix
348    # number of grid points
349    q = np.shape(YTheo)[0]
350    # number of observed values
351    N = len(Yobs)
352    v_matrix = np.zeros((N, N))
353    for i in range(q):
354        difference = np.subtract(YTheoMat[:, i], Yav)
355        v_matrix += np.matmul(difference, difference.T)
356    v_matrix = v_matrix / float(q - 1)
357
358    # Include observational errors in the variance-covariance matrix
359    v_matrix = v_matrix + np.diag(obs_err**2.0)
360    # check if positive definite and make figure
361    check_matrix(v_matrix, generate_output=generate_output, fig_title=fig_title, star_name=star_name)
362    # Calculate Mahalanobis distances
363    MD = np.zeros(q)
364    v_matrix_inv = np.linalg.inv(v_matrix)
365    for i in range(q):
366        diff = YTheoMat[:, i] - YobsMat
367        MD[i] = np.matmul(np.matmul(diff.T, v_matrix_inv), diff)[0][0]
368
369    return MD

Calculate mahalanobis distance (MD) values for the given theoretical patterns.

Parameters
  • Yobs (numpy array, dtype=float): observed values (period or frequency)
  • obs_err (numpy array, dtype=float): Errors on Yobs
  • YTheo (numpy array of arrays of floats): Array of all theoretical patterns to calculate the MD value for.
  • generate_output (boolean): Flag to write output and plot the variance-covariance matrix
  • fig_title (string): The name of the figure to be created.
  • star_name (string): The name of the analysed star, for file naming purposes.
Returns
  • MD (numpy array, dtype=float): Mahalanobis distances for the given theoretical patterns.
def check_matrix(v_matrix, generate_output=True, fig_title='Vmatrix', star_name=None):
373def check_matrix(v_matrix, generate_output=True, fig_title="Vmatrix", star_name=None):
374    """
375    Check the if the the eigenvalues of the Variance-covariance matrix are all positive,
376    since this means the matrix is positive definite. Compute its determinant and condition number,
377    and write them to a tsv file. Create and save a figure of the variance-covariance matrix.
378
379    Parameters
380    ----------
381    v_matrix: numpy ndarray (2D), dtype=float
382        Variance-covariance matrix
383    output: boolean
384        Flag to write output and plot the variance-covariance matrix
385    fig_title: string
386        The name of the figure to be created.
387    star_name: string
388        The name of the analysed star, for file naming purposes.
389    """
390    # If all eigenvalues are >0, it is positive definite
391    if np.all(np.linalg.eigvals(v_matrix) > 0) == False:
392        logger.error("V matrix is possibly not positive definite (since eigenvalues are not all > 0)")
393        sys.exit(1)
394    logger.debug(f"max(v_matrix) = {np.max(v_matrix)}")
395    # multiply the matrix by the exponent of this, otherwise the determinant can be too small for the numerics
396    kk = 10
397
398    if generate_output is True:
399        file_path = Path(f"{os.getcwd()}/V_matrix/{star_name}_determinant_conditionNr.tsv")
400        file_path.parent.mkdir(parents=True, exist_ok=True)
401        if not file_path.is_file():
402            with file_path.open("w") as file:
403                file.write(f"method \t ln(det(V)) \t condition_number \n")
404        with file_path.open("a") as file:
405            file.write(
406                f"{fig_title} \t {np.log(np.linalg.det(np.exp(kk)*v_matrix))-kk*v_matrix.shape[0]:.2f} \t {np.linalg.cond(v_matrix):.2f} \n "
407            )
408
409        # Do *10^4 to get rid of small values, and put this in the colorbar label
410        im = plt.imshow(v_matrix * 10**4, aspect="auto", cmap="Reds")
411        plt.ylabel(rf"obs {v_matrix.shape[0]}  $\leftarrow$ obs 1", size=14)
412        plt.xlabel(rf"obs 1 $\rightarrow$ obs {v_matrix.shape[0]} ", size=14)
413
414        cbar = plt.colorbar(im)
415        cbar.ax.set_ylabel(r"[d$^{2} 10^{-4}$]", rotation=90, labelpad=15, size=14)
416        cbar.ax.tick_params(labelsize=14)
417        plt.title("Variance covariance matrix")
418        plt.tick_params(labelsize=14)
419        plt.tight_layout()
420        plt.savefig(f"{os.getcwd()}/V_matrix/{fig_title}.png")
421        plt.clf()
422        plt.close("all")

Check the if the the eigenvalues of the Variance-covariance matrix are all positive, since this means the matrix is positive definite. Compute its determinant and condition number, and write them to a tsv file. Create and save a figure of the variance-covariance matrix.

Parameters
  • v_matrix (numpy ndarray (2D), dtype=float): Variance-covariance matrix
  • output (boolean): Flag to write output and plot the variance-covariance matrix
  • fig_title (string): The name of the figure to be created.
  • star_name (string): The name of the analysed star, for file naming purposes.