const naturalAminoAcids = 'ARNDCEQGHILKMFPSTWYV';
// Ambiguity codes accepted in amino-acid sequences on top of the natural ones.
const ambiguousAminoAcids = 'XBZJ';
// Bases whose share of a sequence is measured when guessing its type.
const baseNucleicAcids = 'ACGTU';
// Ambiguity codes accepted in nucleic-acid sequences on top of the bases.
const ambiguousNucleicAcids = 'WSMKRYBDHVNZ';
// Characters unique only to their subset e.g. 'U' is not a valid AA.
// These patterns are shared module-level objects that are only used for
// yes/no matching. They deliberately do NOT carry the `g` flag: a global
// RegExp remembers `lastIndex` between `.test()` calls, which would make the
// outcome of one check depend on whatever string was checked before it.
const aminoAcidsOnly = /[EQILFPXJ]/i;
const nucleicAcidsOnly = /U/i;
// Every character that may appear in an amino-acid sequence.
export const validAminoAcids =
  naturalAminoAcids +
  ambiguousAminoAcids +
  '*' + // Stop
  '.-'; // Gaps
// Every character that may appear in a nucleic-acid sequence.
// No gaps e.g. - and . are not allowed
export const validNucleicAcids = `${baseNucleicAcids}${ambiguousNucleicAcids}`;
/**
 * Frozen catalogue of the failure results the validator can return.
 * Every entry has `valid: false` plus a human-readable `message`; both the
 * catalogue and its entries are frozen so callers cannot alter them.
 */
export const errorResponses = (() => {
  // No usable sequence was supplied.
  const missingSequence = Object.freeze({
    valid: false,
    message: 'The sequence is missing',
  });
  // The sequence contains characters that are not allowed.
  const invalidSequence = Object.freeze({
    valid: false,
    message: 'The sequence is invalid',
  });
  // The sequence is shorter than the requested minimum length.
  const shortSequence = Object.freeze({
    valid: false,
    message: 'The sequence is too short',
  });
  return Object.freeze({ missingSequence, invalidSequence, shortSequence });
})();
/**
 * Frozen base of a successful validation result. sequenceValidator spreads it
 * into the object it returns for a valid sequence.
 */
export const validResponse = Object.freeze({ valid: true });
// Character whitelists that sequenceValidator applies to the
// whitespace-stripped sequence.
// Keep start ^ and end $ anchors in the regex, so that the WHOLE sequence has
// to consist of allowed characters rather than just a part of it.
// Default mode: matches all alphabet letters, except for the letter O
// (case-insensitive)
// Accepts * . -
const validCharacters = /^[A-NP-Z*.-]+$/i;
// Strict mode: only the letters listed in 'naturalAminoAcids', plus * . -
// (case-insensitive)
const validCharactersUnderStrictMode = new RegExp(
`^[${naturalAminoAcids}*.-]+$`,
'i'
);
/**
 * Very basic check on if a string is likely to be a FASTA-formatted string:
 * it only looks for a '>' or ';' character anywhere in the input.
 *
 * The pattern is intentionally minimal. A leading `.*` would not change the
 * result, but it forces the engine to rescan the rest of the line from every
 * start position, which is quadratic on long sequences without a marker.
 *
 * @param seq - Sequence
 * @returns True if it is likely to be FASTA
 */
const isFASTA = (seq: string) => /[>;]/.test(seq);
/**
 * Turns a FASTA-formatted string into the bare sequences it contains.
 *
 * Lines starting with '>' or ';' act as separators and are dropped. All
 * white-space is removed from what remains, and chunks that end up empty
 * are discarded.
 *
 * @param fasta - A FASTA-formatted string
 * @returns An array of clean sequences, in their original order
 */
function prepareFASTAString(fasta: string) {
  const cleanSequences: string[] = [];
  // Splitting on the 'Description' lines leaves only the sequence chunks
  for (const chunk of fasta.split(/^[>;].*\n?$/gm)) {
    const residues = chunk.replace(/\s/g, '');
    // Skip chunks with nothing left once the white-space is gone
    if (residues) {
      cleanSequences.push(residues);
    }
  }
  return cleanSequences;
}
/**
 * Accepts a sequence of characters and makes a guess if the sequence is an
 * amino-acid (AA) or a nucleic-acid (NA) sequence.
 *
 * Note: This function will always return a guess.
 *
 * The guess is made in two steps:
 * 1. If the sequence contains characters that are unique to exactly one of
 *    the two alphabets ('nucleicAcidsOnly' / 'aminoAcidsOnly'), that alphabet
 *    wins outright.
 * 2. Otherwise the share of nucleic-acid bases ('baseNucleicAcids') is
 *    compared with 'threshold': strictly more than 'threshold' percent means
 *    NA, anything else means AA. An empty sequence therefore yields 'aa'.
 *
 * e.g. with 'threshold' set to 90
 *  sample sequence: AAAAAZZZZZ           (50% bases) -> AA
 *  sample sequence: AAAAAAAAAZ           (90% bases, not MORE than 90) -> AA
 *  sample sequence: AAAAAAAAAAAAAAAAAAAZ (95% bases) -> NA
 *
 * Note: the frequency count is case-sensitive; callers are expected to pass
 * an upper-case sequence (findLikelyType does).
 *
 * @param sequence - A sequence
 * @param threshold - NA threshold, a percentage from 1 to 100
 * @returns The likely type either 'aa' or 'na'
 */
function guessSequenceType(sequence: string, threshold: number): 'aa' | 'na' {
  const typeAA = 'aa';
  const typeNA = 'na';

  // `search` always scans from the start of the string and leaves
  // `lastIndex` untouched, so the answer cannot be skewed by earlier
  // matches, even if the shared patterns carry the `g` flag.
  const hasNucleicAcidOnly = sequence.search(nucleicAcidsOnly) !== -1;
  const hasAminoAcidOnly = sequence.search(aminoAcidsOnly) !== -1;

  // Characters exclusive to exactly one alphabet settle the question.
  // If both kinds (or neither) are present, fall through to the frequencies.
  if (hasNucleicAcidOnly !== hasAminoAcidOnly) {
    return hasNucleicAcidOnly ? typeNA : typeAA;
  }

  // Count how many characters are nucleic-acid bases
  let nucleicAcidBases = 0;
  for (let index = 0; index < sequence.length; index += 1) {
    if (baseNucleicAcids.includes(sequence.charAt(index))) {
      nucleicAcidBases += 1;
    }
  }

  // Is this above our arbitrary threshold?
  // Comparing "count * 100 > threshold * length" is the same test as
  // "percentage > threshold", but avoids adding up rounded fractions.
  if (nucleicAcidBases * 100 > threshold * sequence.length) {
    return typeNA;
  }

  // If you have reached here and it is not NA, then our guess
  // would be AA.
  return typeAA;
}
/**
 * Normalises a sequence and asks 'guessSequenceType' for its likely type.
 *
 * Steps:
 * 1. Remove every non-letter character, plus N and X, and upper-case the rest.
 * 2. If more than 90% of what is left is ACGTU: Nucleic-Acids.
 * 3. Else: Amino-Acids.
 *
 * Note: Removing N and X is fine because 'N' is valid for both AA and NA
 * sequences, therefore removing it should equally reduce the sequence length,
 * regardless of the type of sequence. 'X' can be safely removed since it only
 * exists in AA sequences and is not a valid character in NA sequences.
 *
 * @param sequence - A sequence
 * @returns The likely type either 'aa' or 'na'
 */
export function findLikelyType(sequence: string) {
  // Percentage of ACGTU above which the sequence is considered NA
  const nucleicAcidBaseThreshold = 90;
  const informativeLetters = sequence.replace(/[^A-Z]|[NX]/gi, '');
  return guessSequenceType(
    informativeLetters.toUpperCase(),
    nucleicAcidBaseThreshold
  );
}
/**
 * Core internal validation function for a single sequence
 *
 * The checks run in this order and the first failure is returned:
 * 1. missing input (empty, or nothing left after clean-up)
 * 2. shorter than 'minimumLength' (when one is given)
 * 3. contains characters that are not allowed
 *
 * If the input looks like FASTA, only its first sequence is validated.
 *
 * @param sequence - A sequence, either raw or FASTA-formatted
 * @param minimumLength - Optional minimum number of characters the cleaned
 *   sequence must have; a falsy value (undefined, 0) disables the check
 * @param strict - When truthy, validate against the strict-mode character
 *   set instead of the default one
 * @returns One of the frozen 'errorResponses' on failure; otherwise a new
 *   object with `valid: true`, the guessed `likelyType` and the ORIGINAL
 *   (un-cleaned) `sequence`
 */
export function sequenceValidator(
  sequence: string,
  minimumLength?: number,
  strict?: boolean
) {
  // Sequence was not passed at all
  if (!sequence) {
    return errorResponses.missingSequence;
  }

  // Remove all white-spaces and FASTA bits.
  // A FASTA input without any sequence yields an empty list, hence undefined.
  const cleanSequence: string | undefined = isFASTA(sequence)
    ? prepareFASTAString(sequence)[0]
    : sequence.replace(/\s/g, '');

  // Nothing left?
  if (!cleanSequence) {
    return errorResponses.missingSequence;
  }

  if (minimumLength && cleanSequence.length < minimumLength) {
    return errorResponses.shortSequence;
  }

  // Check and fail if there are any invalid characters in the sequence
  const allowedCharacters = strict
    ? validCharactersUnderStrictMode
    : validCharacters;
  if (!allowedCharacters.test(cleanSequence)) {
    return errorResponses.invalidSequence;
  }

  // Attempt to find what type is more likely
  const likelyType: 'na' | 'aa' = findLikelyType(cleanSequence);

  // Defensive: fail if type can't be detected
  if (!likelyType) {
    return errorResponses.invalidSequence;
  }

  return {
    ...validResponse,
    likelyType,
    sequence,
  };
}
/**
 * Main validation function
 *
 * A string input is treated as a single sequence, unless it looks like
 * FASTA, in which case every sequence it contains is validated separately.
 * An array input is validated element by element.
 *
 * @param input - An array of sequences or a string
 * @param minimumLength - Optional minimum length forwarded to
 *   'sequenceValidator' for every sequence
 * @returns The result of validating each sequence while keeping order
 *   intact; an empty array when there is nothing to validate
 * @throws {Error} When the input is neither a string nor an array
 */
function validateSequences(input: string | string[], minimumLength?: number) {
  let sequences: string[];

  if (typeof input === 'string') {
    // Splitting FASTA here is important, otherwise you will get a result
    // similar to [[a], [b]] instead of [a, b].
    // Anything else is a single sequence: we only work with arrays, so wrap it.
    sequences = isFASTA(input) ? prepareFASTAString(input) : [input];
  } else if (Array.isArray(input)) {
    // Work on a copy so the caller's array is never touched
    sequences = [...input];
  } else {
    // Reachable from untyped callers only. Checked BEFORE copying, so the
    // caller gets this message (with the type of what was actually passed)
    // rather than a generic "not iterable" failure.
    throw new Error(
      `Sequence Validation function expects an Array<string>|string, but received ${typeof input}`
    );
  }

  // Validate each sequence separately, compile and return the results.
  // An empty list simply maps to an empty result.
  return sequences.map((sequence) =>
    sequenceValidator(sequence, minimumLength)
  );
}
export default validateSequences;
|